diff --git a/.appveyor.yml b/.appveyor.yml new file mode 100644 index 0000000..004019d --- /dev/null +++ b/.appveyor.yml @@ -0,0 +1,78 @@ + +version: "{branch}.build.{build}" + +clone_folder: c:\projects\simdcomp + +#cache: +# c:\build-cache -> .appveyor.yml + +environment: + matrix: + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + ARCH: x64 +# looks like vc14 has trouble with code on x86, at least on the AppVeyor image +# - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 +# ARCH: x86 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + ARCH: x64 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + ARCH: x86 + +build_script: + ps: | + cd c:\projects\simdcomp + echo "" | Out-File -Encoding "ASCII" task.bat + if ('Visual Studio 2015' -eq $env:APPVEYOR_BUILD_WORKER_IMAGE) { + $VC = 14; + $vs_shell_cmd = 'call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" ' + $env:ARCH + ' 2>&1' + } elseif ('Visual Studio 2017' -eq $env:APPVEYOR_BUILD_WORKER_IMAGE) { + $VC = 15; + if ('x64' -eq $env:ARCH) { + $vs_shell_cmd = 'call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" 2>&1' + } else { + $vs_shell_cmd = 'call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars32.bat" 2>&1' + } + } + mkdir 'c:\tmp_pack' + echo $vs_shell_cmd | Out-File -Encoding "ASCII" -Append task.bat + $move_cmd = 'move *.zip c:\tmp_pack' + if ($VC -gt 14) { + # these won't be tested, just build and upload artifact, vc15 only + $cmd = 'nmake /nologo /f makefile.vc AVX512=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' 2>&1' + echo $cmd | Out-File -Encoding "ASCII" -Append task.bat + $cmd = 'nmake /nologo /f makefile.vc AVX512=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' pack 2>&1' + echo $cmd | Out-File -Encoding "ASCII" -Append task.bat + echo $move_cmd | Out-File -Encoding "ASCII" -Append task.bat + echo 'nmake /nologo /f makefile.vc clean' | Out-File -Encoding "ASCII" -Append task.bat + $cmd = 'nmake /nologo /f makefile.vc AVX2=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' 2>&1' + echo $cmd | Out-File -Encoding "ASCII" -Append task.bat + $cmd = 'nmake /nologo /f makefile.vc AVX2=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' pack 2>&1' + echo $cmd | Out-File -Encoding "ASCII" -Append task.bat + echo $move_cmd | Out-File -Encoding "ASCII" -Append task.bat + echo 'nmake /nologo /f makefile.vc clean' | Out-File -Encoding "ASCII" -Append task.bat + } + $cmd = 'nmake /nologo /f makefile.vc PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' 2>&1' + echo $cmd | Out-File -Encoding "ASCII" -Append task.bat + $cmd = 'nmake /nologo /f makefile.vc PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' pack 2>&1' + echo $cmd | Out-File -Encoding "ASCII" -Append task.bat + echo $move_cmd | Out-File -Encoding "ASCII" -Append task.bat + $here = (Get-Item -Path "." -Verbose).FullName + $task = $here + '\task.bat' + & $task + +after_build: + ps: | + Get-ChildItem 'c:\tmp_pack' -Filter *.zip | + Foreach-Object { + Push-AppveyorArtifact $_.FullName + } + +test_script: + ps: | + cd c:\projects\simdcomp + echo "" | Out-File -Encoding "ASCII" task.bat + $here = (Get-Item -Path "." 
-Verbose).FullName + echo '.\unit.exe' | Out-File -Encoding "ASCII" -Append task.bat + $task = $here + '\task.bat' + & $task + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b833428 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +Makefile.in +lib* +unit* +*.o +src/*.lo +src/*.o +src/.deps +src/.dirstamp +src/.libs diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..33de3a8 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,11 @@ +language: c +sudo: false +compiler: + - gcc + - clang + +branches: + only: + - master + +script: make && ./unit && ./unit_chars && make clean diff --git a/CHANGELOG b/CHANGELOG index f3c6a38..bf56786 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,9 @@ +Upcoming + - added missing include + - improved portability (MSVC) + - implemented C89 compatibility +Version 0.0.3 (19 May 2014) + - improved documentation Version 0.0.2 (6 February 2014) - added go demo Version 0.0.1 (5 February 2014) diff --git a/LICENSE b/LICENSE index 00580bb..f3c5904 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2014, Daniel Lemire +Copyright (c) 2014--, The authors All rights reserved. Redistribution and use in source and binary forms, with or without modification, diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..48ee108 --- /dev/null +++ b/Makefile @@ -0,0 +1,88 @@ +# minimalist makefile +.SUFFIXES: +# +.SUFFIXES: .cpp .o .c .h +ifeq ($(DEBUG),1) +CFLAGS = -fPIC -std=c89 -ggdb -march=native -Wall -Wextra -Wshadow -fsanitize=undefined -fno-omit-frame-pointer -fsanitize=address +else +CFLAGS = -fPIC -std=c89 -O3 -march=native -Wall -Wextra -Wshadow +endif # debug +LDFLAGS = -shared +LIBNAME=libsimdcomp.so.0.0.3 +STATICLIBNAME=libsimdcomp.a +all: unit unit_chars bitpackingbenchmark $(LIBNAME) $(STATICLIBNAME) +test: + ./unit + ./unit_chars +install: $(OBJECTS) + cp $(LIBNAME) /usr/local/lib + ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so + ldconfig + cp $(HEADERS) /usr/local/include + + + +HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h ./include/simdfor.h ./include/avxbitpacking.h ./include/avx512bitpacking.h + +uninstall: + for h in $(HEADERS) ; do rm /usr/local/$$h; done + rm /usr/local/lib/$(LIBNAME) + rm /usr/local/lib/libsimdcomp.so + ldconfig + + +OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o \ + simdpackedsearch.o simdpackedselect.o simdfor.o avxbitpacking.o avx512bitpacking.o + +$(LIBNAME): $(OBJECTS) + $(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS) + +$(STATICLIBNAME): $(OBJECTS) + ar -qcs $@ $(OBJECTS) + ranlib $@ + +avx512bitpacking.o: ./src/avx512bitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/avx512bitpacking.c -Iinclude + + + +avxbitpacking.o: ./src/avxbitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/avxbitpacking.c -Iinclude + + +simdfor.o: ./src/simdfor.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdfor.c -Iinclude + + +simdcomputil.o: ./src/simdcomputil.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude + +simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude + +simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude + +simdpackedsearch.o: ./src/simdpackedsearch.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdpackedsearch.c -Iinclude + +simdpackedselect.o: ./src/simdpackedselect.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdpackedselect.c 
-Iinclude + +example: ./example.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS) + +unit: ./tests/unit.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude $(OBJECTS) + +bitpackingbenchmark: ./benchmarks/bitpackingbenchmark.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o bitpackingbenchmark ./benchmarks/bitpackingbenchmark.c -Iinclude $(OBJECTS) +benchmark: ./benchmarks/benchmark.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o benchmark ./benchmarks/benchmark.c -Iinclude $(OBJECTS) +dynunit: ./tests/unit.c $(HEADERS) $(LIBNAME) + $(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude -lsimdcomp + +unit_chars: ./tests/unit_chars.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o unit_chars ./tests/unit_chars.c -Iinclude $(OBJECTS) +clean: + rm -f unit *.o $(LIBNAME) $(STATICLIBNAME) example benchmark bitpackingbenchmark dynunit unit_chars diff --git a/README.md b/README.md index 16cd67c..198b39b 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,21 @@ -The SIMDComp library +The SIMDComp library ==================== +[![Build Status](https://img.shields.io/appveyor/ci/lemire/simdcomp.svg)](https://ci.appveyor.com/project/lemire/simdcomp) + A simple C library for compressing lists of integers using binary packing and SIMD instructions. +The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of 32-bit random numbers. + +This library can decode at least 4 billion compressed integers per second on most +desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s. +This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4. + +On a Skylake Intel processor, it can decode integers at a rate of 0.3 cycles per integer, +which can easily translate into more than 8 billion decoded integers per second. + +This library is part of the [Awesome C](https://github.com/kozross/awesome-c) list of C resources. -This library can decode billions of compressed integers per second on most -desktop or laptop processors. +Contributors: Daniel Lemire, Nathan Kurz, Christoph Rupp, Anatol Belski, Nick White and others What is it for? ------------- @@ -12,13 +23,23 @@ What is it for? This is a low-level library for fast integer compression. By design it does not define a compressed format. It is up to the (sophisticated) user to create a compressed format. +It is used by: +- [upscaledb](https://github.com/cruppstahl/upscaledb) +- [EventQL](https://github.com/eventql/eventql) +- [ManticoreSearch](https://manticoresearch.com) + + + Requirements ------------- -- Your processor should support SSE2 (Pentium4 or better) +- Your processor should support SSE4.1 (It is supported by most Intel and AMD processors released since 2008.) +- It is possible to build the core part of the code if your processor supports SSE2 (Pentium4 or better) - C99 compliant compiler (GCC is assumed) - A Linux-like distribution is assumed by the makefile +For a plain C version that does not use SIMD instructions, see https://github.com/lemire/LittleIntPacker + Usage ------- @@ -31,20 +52,62 @@ run it with "make example; ./example"). 1) Lists of integers in random order. 
- const uint32_t b = maxbits(datain);// computes bit width - simdpackwithoutmask(datain, buffer, b);//compressed to buffer - simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer +```C +const uint32_t b = maxbits(datain);// computes bit width +simdpackwithoutmask(datain, buffer, b);//compressed to buffer, compressing 128 32-bit integers down to b*32 bytes +simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer +``` While 128 32-bit integers are read, only b 128-bit words are written. Thus, the compression ratio is 32/b. 2) Sorted lists of integers. We used differential coding: we store the difference between successive integers. For this purpose, we need an initial value (called offset). - - uint32_t offset = 0; - uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width - simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressed - simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed + +```C +uint32_t offset = 0; +uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width +simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressing 128 32-bit integers down to b1*32 bytes +simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed +``` + +General example for arrays of arbitrary length: +```C +int compress_decompress_demo() { + size_t k, N = 9999; + __m128i * endofbuf; + uint32_t * datain = malloc(N * sizeof(uint32_t)); + uint8_t * buffer; + uint32_t * backbuffer = malloc(N * sizeof(uint32_t)); + uint32_t b; + + for (k = 0; k < N; ++k){ /* start with k=0, not k=1! */ + datain[k] = k; + } + + b = maxbits_length(datain, N); + buffer = malloc(simdpack_compressedbytes(N,b)); // allocate just enough memory + endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b); + /* compressed data is stored between buffer and endofbuf using (endofbuf-buffer)*sizeof(__m128i) bytes */ + /* would be safe to do : buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); */ + simdunpack_length((const __m128i *)buffer, N, backbuffer, b); + + for (k = 0; k < N; ++k){ + if(datain[k] != backbuffer[k]) { + printf("bug\n"); + return -1; + } + } + return 0; +} +``` + + +3) Frame-of-Reference + +We also have frame-of-reference (FOR) functions (see simdfor.h header). They work like the bit packing +routines, but do not use differential coding so they allow faster search in some cases, at the expense +of compression. Setup --------- @@ -55,7 +118,7 @@ make test and if you are daring: -make install +make install Go -------- @@ -64,22 +127,37 @@ If you are a go user, there is a "go" folder where you will find a simple demo. 
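A short illustration of item 3 above (Frame-of-Reference): the sketch below shows how the simdfor.h routines could be called on a single block of 128 integers, in the same spirit as the Usage examples. It is an editor's illustrative sketch rather than one of the shipped examples; it assumes simdcomp.h has been included and that SSE4.1 is available (simdmaxmin requires it). As with the other routines, the caller is responsible for storing the reference value and the bit width alongside the compressed block.

```C
uint32_t datain[128];                   /* 128 input values, filled by the caller */
uint32_t backbuffer[128];
__m128i buffer[32];                     /* worst case: 32 x 128-bit words for 32-bit values */
uint32_t minval, maxval, b;

simdmaxmin(datain, &minval, &maxval);   /* fast min/max over the block (SSE4.1) */
b = bits(maxval - minval);              /* bit width needed relative to the minimum */
simdpackFOR(minval, datain, buffer, b); /* pack 128 values as offsets from minval */
simdunpackFOR(minval, buffer, backbuffer, b); /* recover the original values */
```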
Other libraries ---------------- +* Fast integer compression in Go: https://github.com/ronanh/intcomp +* Fast Bitpacking algorithms: Rust port of simdcomp https://github.com/quickwit-oss/bitpacking +* SIMDCompressionAndIntersection: A C++ library to compress and intersect sorted lists of integers using SIMD instructions https://github.com/lemire/SIMDCompressionAndIntersection +* The FastPFOR C++ library: Fast integer compression https://github.com/lemire/FastPFor +* High-performance dictionary coding https://github.com/lemire/dictionary +* LittleIntPacker: C library to pack and unpack short arrays of integers as fast as possible https://github.com/lemire/LittleIntPacker +* StreamVByte: Fast integer compression in C using the StreamVByte codec https://github.com/lemire/streamvbyte +* MaskedVByte: Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte +* CSharpFastPFOR: A C# integer compression library https://github.com/Genbox/CSharpFastPFOR +* JavaFastPFOR: A Java integer compression library https://github.com/lemire/JavaFastPFOR +* Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding +* FrameOfReference is a C++ library dedicated to frame-of-reference (FOR) compression: https://github.com/lemire/FrameOfReference +* libvbyte: A fast implementation for varbyte 32bit/64bit integer compression https://github.com/cruppstahl/libvbyte +* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor +* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch + + +Other programming languages ------------- -FastPFOR is a C++ research library well suited to compress unsorted arrays: -https://github.com/lemire/FastPFor - -SIMDCompressionAndIntersection is a C++ research library well suited for sorted arrays (differential coding) -and computing intersections: -https://github.com/lemire/SIMDCompressionAndIntersection +- [There is a wrapper for Julia](https://github.com/mcovalt/TinyInt.jl). +- [There is a Rust port](https://github.com/tantivy-search/bitpacking/). References ------------ - - -Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software: Practice & Experience, 2013. -http://dx.doi.org/10.1002/spe.2203 - -Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the -Intersection of Sorted Integers -http://arxiv.org/abs/1401.6399 - +* Daniel Lemire, Nathan Kurz, Christoph Rupp, Stream VByte: Faster Byte-Oriented Integer Compression, Information Processing Letters 130, February 2018, Pages 1-6. https://arxiv.org/abs/1709.08990 +* Jianguo Wang, Chunbin Lin, Yannis Papakonstantinou, Steven Swanson, An Experimental Study of Bitmap Compression vs. Inverted List Compression, SIGMOD 2017 http://db.ucsd.edu/wp-content/uploads/2017/03/sidm338-wangA.pdf +* P. Damme, D. Habich, J. Hildebrandt, W. Lehner, Lightweight Data Compression Algorithms: An Experimental Survey (Experiments and Analyses), EDBT 2017 http://openproceedings.org/2017/conf/edbt/paper-146.pdf +* P. Damme, D. Habich, J. Hildebrandt, W. Lehner, Insights into the Comparative Evaluation of Lightweight Data Compression Algorithms, EDBT 2017 http://openproceedings.org/2017/conf/edbt/paper-414.pdf +* Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience 46 (6) 2016. 
http://arxiv.org/abs/1401.6399 +* Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015. http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract +* Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387 +* Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916 +* T. D. Wu, Bitpacking techniques for indexing genomes: I. Hash tables, Algorithms for Molecular Biology 11 (5), 2016. http://almob.biomedcentral.com/articles/10.1186/s13015-016-0069-5 diff --git a/benchmarks/benchmark.c b/benchmarks/benchmark.c new file mode 100644 index 0000000..783ef94 --- /dev/null +++ b/benchmarks/benchmark.c @@ -0,0 +1,235 @@ +/** + * This code is released under a BSD License. + */ +#include +#include +#include +#include + +#include "simdcomp.h" + +#ifdef _MSC_VER +# include + +__int64 freq; + +typedef __int64 time_snap_t; + +static time_snap_t time_snap(void) +{ + __int64 now; + + QueryPerformanceCounter((LARGE_INTEGER *)&now); + + return (__int64)((now*1000000)/freq); +} +# define TIME_SNAP_FMT "%I64d" +#else +# define time_snap clock +# define TIME_SNAP_FMT "%lu" +typedef clock_t time_snap_t; +#endif + + +void benchmarkSelect() { + uint32_t buffer[128]; + uint32_t backbuffer[128]; + uint32_t initial = 33; + uint32_t b; + time_snap_t S1, S2, S3; + int i; + printf("benchmarking select \n"); + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 0; b <= 32; b++) { + uint32_t prev = initial; + uint32_t out[128]; + /* initialize the buffer */ + for (i = 0; i < 128; i++) { + buffer[i] = ((uint32_t)(1655765 * i )) ; + if(b < 32) buffer[i] %= (1< *ib) + return 1; + return 0; +} + +/* adapted from wikipedia */ +int binary_search(uint32_t * A, uint32_t key, int imin, int imax) +{ + int imid; + imax --; + while(imin + 1 < imax) { + imid = imin + ((imax - imin) / 2); + + if (A[imid] > key) { + imax = imid; + } else if (A[imid] < key) { + imin = imid; + } else { + return imid; + } + } + return imax; +} + + +/* adapted from wikipedia */ +int lower_bound(uint32_t * A, uint32_t key, int imin, int imax) +{ + int imid; + imax --; + while(imin + 1 < imax) { + imid = imin + ((imax - imin) / 2); + + if (A[imid] >= key) { + imax = imid; + } else if (A[imid] < key) { + imin = imid; + } + } + if(A[imin] >= key) return imin; + return imax; +} + +void benchmarkSearch() { + uint32_t buffer[128]; + uint32_t backbuffer[128]; + uint32_t out[128]; + uint32_t result, initial = 0; + uint32_t b, i; + time_snap_t S1, S2, S3, S4; + + printf("benchmarking search \n"); + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 0; b <= 32; b++) { + uint32_t prev = initial; + /* initialize the buffer */ + for (i = 0; i < 128; i++) { + buffer[i] = ((uint32_t)rand()) ; + if(b < 32) buffer[i] %= (1< 0) { + if(buffer[pos-1] >= pseudorandomkey) + printf("bug B.\n"); + } + } + S2 = time_snap(); + for (i = 0; i < 128 * 10; i++) { + int pos; + uint32_t pseudorandomkey = buffer[i%128]; + simdunpackd1(initial, (__m128i *)out, backbuffer, b); + pos = lower_bound(backbuffer, 
pseudorandomkey, 0, 128); + result = backbuffer[pos]; + + if((result < pseudorandomkey) || (buffer[pos] != result)) { + printf("bug C.\n"); + } else if (pos > 0) { + if(buffer[pos-1] >= pseudorandomkey) + printf("bug D.\n"); + } + } + S3 = time_snap(); + for (i = 0; i < 128 * 10; i++) { + + int pos; + uint32_t pseudorandomkey = buffer[i%128]; + pos = simdsearchwithlengthd1(initial, (__m128i *)out, b, 128, + pseudorandomkey, &result); + if((result < pseudorandomkey) || (buffer[pos] != result)) { + printf("bug A.\n"); + } else if (pos > 0) { + if(buffer[pos-1] >= pseudorandomkey) + printf("bug B.\n"); + } + } + S4 = time_snap(); + + printf("bit width = %d, fast search function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " , fast with length time = " TIME_SNAP_FMT " \n", b, (S2-S1), (S3-S2), (S4-S3) ); + } +} + + +int main() { +#ifdef _MSC_VER + QueryPerformanceFrequency((LARGE_INTEGER *)&freq); +#endif + benchmarkSearch(); + benchmarkSelect(); + return 0; +} diff --git a/benchmarks/bitpackingbenchmark.c b/benchmarks/bitpackingbenchmark.c new file mode 100644 index 0000000..bcbe5d2 --- /dev/null +++ b/benchmarks/bitpackingbenchmark.c @@ -0,0 +1,259 @@ +#include +#include + +#include "simdcomp.h" + + +#define RDTSC_START(cycles) \ + do { \ + register unsigned cyc_high, cyc_low; \ + __asm volatile( \ + "cpuid\n\t" \ + "rdtsc\n\t" \ + "mov %%edx, %0\n\t" \ + "mov %%eax, %1\n\t" \ + : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \ + (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ + } while (0) + +#define RDTSC_FINAL(cycles) \ + do { \ + register unsigned cyc_high, cyc_low; \ + __asm volatile( \ + "rdtscp\n\t" \ + "mov %%edx, %0\n\t" \ + "mov %%eax, %1\n\t" \ + "cpuid\n\t" \ + : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \ + (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ + } while (0) + + + + +uint32_t * get_random_array_from_bit_width(uint32_t length, uint32_t bit) { + uint32_t * answer = malloc(sizeof(uint32_t) * length); + uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1); + uint32_t i; + for(i = 0; i < length; ++i) { + answer[i] = rand() & mask; + } + return answer; +} + +uint32_t * get_random_array_from_bit_width_d1(uint32_t length, uint32_t bit) { + uint32_t * answer = malloc(sizeof(uint32_t) * length); + uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1); + uint32_t i; + answer[0] = rand() & mask; + for(i = 1; i < length; ++i) { + answer[i] = answer[i-1] + (rand() & mask); + } + return answer; +} + + +void demo128() { + const uint32_t length = 128; + uint32_t bit; + printf("# --- %s\n", __func__); + printf("# compressing %d integers\n",length); + printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); + for(bit = 1; bit <= 32; ++bit) { + uint32_t i; + + uint32_t * data = get_random_array_from_bit_width(length, bit); + __m128i * buffer = malloc(length * sizeof(uint32_t)); + uint32_t * backdata = malloc(length * sizeof(uint32_t)); + uint32_t repeat = 500; + uint64_t min_diff; + printf("%d\t",bit); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + simdpackwithoutmask(data,buffer, bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, 
cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + simdunpack(buffer, backdata,bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + + free(data); + free(buffer); + free(backdata); + printf("\n"); + } + printf("\n\n"); /* two blank lines are required by gnuplot */ +} + +void demo128_d1() { + const uint32_t length = 128; + uint32_t bit; + printf("# --- %s\n", __func__); + printf("# compressing %d integers\n",length); + printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); + for(bit = 1; bit <= 32; ++bit) { + uint32_t i; + + uint32_t * data = get_random_array_from_bit_width_d1(length, bit); + __m128i * buffer = malloc(length * sizeof(uint32_t)); + uint32_t * backdata = malloc(length * sizeof(uint32_t)); + uint32_t repeat = 500; + uint64_t min_diff; + printf("%d\t",bit); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + simdpackwithoutmaskd1(0,data,buffer, bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + simdunpackd1(0,buffer, backdata,bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + + free(data); + free(buffer); + free(backdata); + printf("\n"); + } + printf("\n\n"); /* two blank lines are required by gnuplot */ +} + +#ifdef __AVX2__ +void demo256() { + const uint32_t length = 256; + uint32_t bit; + printf("# --- %s\n", __func__); + printf("# compressing %d integers\n",length); + printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); + for(bit = 1; bit <= 32; ++bit) { + uint32_t i; + + uint32_t * data = get_random_array_from_bit_width(length, bit); + __m256i * buffer = malloc(length * sizeof(uint32_t)); + uint32_t * backdata = malloc(length * sizeof(uint32_t)); + uint32_t repeat = 500; + uint64_t min_diff; + printf("%d\t",bit); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + avxpackwithoutmask(data,buffer, bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + avxunpack(buffer, backdata,bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + + free(data); + free(buffer); + free(backdata); + printf("\n"); + } + printf("\n\n"); /* two blank lines are required by gnuplot */ +} +#endif /* avx 2 */ + +#ifdef __AVX512F__ +void demo512() { + const uint32_t 
length = 512; + uint32_t bit; + size_t z; + printf("# --- %s\n", __func__); + printf("# compressing %d integers\n",length); + printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); + for(bit = 1; bit <= 32; ++bit) { + uint32_t i; + + uint32_t * data = get_random_array_from_bit_width(length, bit); + __m512i * buffer = malloc(length * sizeof(uint32_t)); + uint32_t * backdata = malloc(length * sizeof(uint32_t)); + uint32_t repeat = 500; + uint64_t min_diff; + printf("%d\t",bit); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + avx512packwithoutmask(data,buffer, bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + avx512unpack(buffer, backdata,bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + for(z = 0 ; z < length ; ++z) assert(backdata[z] == data[z]); + free(data); + free(buffer); + free(backdata); + printf("\n"); + } + printf("\n\n"); /* two blank lines are required by gnuplot */ +} +#endif /* avx 2 */ + + + +int main() { + demo128(); + demo128_d1(); +#ifdef __AVX2__ + demo256(); +#endif +#ifdef __AVX512F__ + demo512(); +#endif + return 0; + + +} diff --git a/example.c b/example.c index 0394e20..1d68f95 100644 --- a/example.c +++ b/example.c @@ -1,66 +1,195 @@ +/* Type "make example" to build this example program. */ #include #include +#include #include "simdcomp.h" +/** +We provide several different code examples. +**/ -// compresses data from datain to buffer, returns how many bytes written + +/* very simple test to illustrate a simple application */ +int compress_decompress_demo() { + size_t k, N = 9999; + __m128i * endofbuf; + int howmanybytes; + float compratio; + uint32_t * datain = malloc(N * sizeof(uint32_t)); + uint8_t * buffer; + uint32_t * backbuffer = malloc(N * sizeof(uint32_t)); + uint32_t b; + printf("== simple test\n"); + + for (k = 0; k < N; ++k) { /* start with k=0, not k=1! */ + datain[k] = k; + } + + b = maxbits_length(datain, N); + buffer = malloc(simdpack_compressedbytes(N,b)); + endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b); + howmanybytes = (endofbuf-(__m128i *)buffer)*sizeof(__m128i); /* number of compressed bytes */ + compratio = N*sizeof(uint32_t) * 1.0 / howmanybytes; + /* endofbuf points to the end of the compressed data */ + buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); /* optional but safe. */ + printf("Compressed %d integers down to %d bytes (comp. ratio = %f).\n",(int)N,howmanybytes,compratio); + /* in actual applications b must be stored and retrieved: caller is responsible for that. 
*/ + simdunpack_length((const __m128i *)buffer, N, backbuffer, b); /* will return a pointer to endofbuf */ + + for (k = 0; k < N; ++k) { + if(datain[k] != backbuffer[k]) { + printf("bug at %lu \n",(unsigned long)k); + return -1; + } + } + printf("Code works!\n"); + free(datain); + free(buffer); + free(backbuffer); + return 0; +} + + + +/* compresses data from datain to buffer, returns how many bytes written +used below in simple_demo */ size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) { - if(length/SIMDBlockSize*SIMDBlockSize != length) { - printf("Data length should be a multiple of %i \n",SIMDBlockSize); - } - uint32_t offset = 0; - uint8_t * initout = buffer; - for(size_t k = 0; k < length / SIMDBlockSize; ++k) { + uint32_t offset; + uint8_t * initout; + size_t k; + if(length/SIMDBlockSize*SIMDBlockSize != length) { + printf("Data length should be a multiple of %i \n",SIMDBlockSize); + } + offset = 0; + initout = buffer; + for(k = 0; k < length / SIMDBlockSize; ++k) { uint32_t b = simdmaxbitsd1(offset, - datain + k * SIMDBlockSize); - *buffer++ = b; - simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer, - b); + datain + k * SIMDBlockSize); + *buffer++ = b; + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer, + b); offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; buffer += b * sizeof(__m128i); - } - return buffer - initout; + } + return buffer - initout; } - -int main() { - int REPEAT = 5; - int N = 1000000 * SIMDBlockSize;//SIMDBlockSize is 128 +/* Another illustration ... */ +void simple_demo() { + size_t REPEAT = 10, gap; + size_t N = 1000 * SIMDBlockSize;/* SIMDBlockSize is 128 */ uint32_t * datain = malloc(N * sizeof(uint32_t)); size_t compsize; clock_t start, end; - - uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); // output buffer + uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */ uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); - for (int gap = 1; gap <= 243; gap *= 3) { - printf("\n"); - printf(" gap = %u \n", gap); - for (int k = 0; k < N; ++k) - datain[k] = k * gap; + printf("== simple demo\n"); + for (gap = 1; gap <= 243; gap *= 3) { + size_t k, repeat; uint32_t offset = 0; + uint32_t bogus = 0; + double numberofseconds; + + printf("\n"); + printf(" gap = %lu \n", (unsigned long) gap); + datain[0] = 0; + for (k = 1; k < N; ++k) + datain[k] = datain[k-1] + ( rand() % (gap + 1) ); compsize = compress(datain,N,buffer); - printf("compression rate = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 )); + printf("compression ratio = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 )); start = clock(); - uint32_t bogus = 0; - for(int repeat = 0; repeat < REPEAT; ++repeat) { - uint8_t * decbuffer = buffer; - for (int k = 0; k * SIMDBlockSize < N; ++k) { - uint8_t b = *decbuffer++; - simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b); - // do something here with backbuffer - bogus += backbuffer[3]; - decbuffer += b * sizeof(__m128i); - offset = backbuffer[SIMDBlockSize - 1]; - } + for(repeat = 0; repeat < REPEAT; ++repeat) { + uint8_t * decbuffer = buffer; + for (k = 0; k * SIMDBlockSize < N; ++k) { + uint8_t b = *decbuffer++; + simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b); + /* do something here with backbuffer */ + bogus += backbuffer[3]; + decbuffer += b * sizeof(__m128i); + offset = backbuffer[SIMDBlockSize - 1]; + } } end = clock(); - double numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; + 
numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0)); + start = clock(); + for(repeat = 0; repeat < REPEAT; ++repeat) { + uint8_t * decbuffer = buffer; + for (k = 0; k * SIMDBlockSize < N; ++k) { + memcpy(backbuffer,decbuffer+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t)); + bogus += backbuffer[3] - backbuffer[100]; + } + } + end = clock(); + numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; + printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0)); printf("ignore me %i \n",bogus); + printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n"); } free(buffer); free(datain); free(backbuffer); +} + +/* Used below in more_sophisticated_demo ... */ +size_t varying_bit_width_compress(uint32_t * datain, size_t length, uint8_t * buffer) { + uint8_t * initout; + size_t k; + if(length/SIMDBlockSize*SIMDBlockSize != length) { + printf("Data length should be a multiple of %i \n",SIMDBlockSize); + } + initout = buffer; + for(k = 0; k < length / SIMDBlockSize; ++k) { + uint32_t b = maxbits(datain); + *buffer++ = b; + simdpackwithoutmask(datain, (__m128i *)buffer, b); + datain += SIMDBlockSize; + buffer += b * sizeof(__m128i); + } + return buffer - initout; +} + +/* Here we compress the data in blocks of 128 integers with varying bit width */ +int varying_bit_width_demo() { + size_t nn = 128 * 2; + uint32_t * datainn = malloc(nn * sizeof(uint32_t)); + uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize); + uint8_t * initbuffern = buffern; + uint32_t * backbuffern = malloc(nn * sizeof(uint32_t)); + size_t k, compsize; + printf("== varying bit-width demo\n"); + + for(k=0; k +/* for memset */ +#include + +#include "simdcomputil.h" + +enum { AVX512BlockSize = 512 }; + +/* max integer logarithm over a range of AVX512BlockSize integers (512 integer) + */ +uint32_t avx512maxbits(const uint32_t *begin); + +/* reads 512 values from "in", writes "bit" 512-bit vectors to "out" */ +void avx512pack(const uint32_t *in, __m512i *out, const uint32_t bit); + +/* reads 512 values from "in", writes "bit" 512-bit vectors to "out" */ +void avx512packwithoutmask(const uint32_t *in, __m512i *out, + const uint32_t bit); + +/* reads "bit" 512-bit vectors from "in", writes 512 values to "out" */ +void avx512unpack(const __m512i *in, uint32_t *out, const uint32_t bit); + +#endif /* __AVX512F__ */ + +#endif /* INCLUDE_AVX512BITPACKING_H_ */ diff --git a/include/avxbitpacking.h b/include/avxbitpacking.h new file mode 100644 index 0000000..f1aefb6 --- /dev/null +++ b/include/avxbitpacking.h @@ -0,0 +1,35 @@ +/** + * This code is released under a BSD License. 
+ */ + +#ifndef INCLUDE_AVXBITPACKING_H_ +#define INCLUDE_AVXBITPACKING_H_ + +#ifdef __AVX2__ + +#include "portability.h" + +/* AVX2 is required */ +#include +/* for memset */ +#include + +#include "simdcomputil.h" + +enum { AVXBlockSize = 256 }; + +/* max integer logarithm over a range of AVXBlockSize integers (256 integer) */ +uint32_t avxmaxbits(const uint32_t *begin); + +/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */ +void avxpack(const uint32_t *in, __m256i *out, const uint32_t bit); + +/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */ +void avxpackwithoutmask(const uint32_t *in, __m256i *out, const uint32_t bit); + +/* reads "bit" 256-bit vectors from "in", writes 256 values to "out" */ +void avxunpack(const __m256i *in, uint32_t *out, const uint32_t bit); + +#endif /* __AVX2__ */ + +#endif /* INCLUDE_AVXBITPACKING_H_ */ diff --git a/include/portability.h b/include/portability.h new file mode 100644 index 0000000..e69c612 --- /dev/null +++ b/include/portability.h @@ -0,0 +1,81 @@ +/** + * This code is released under a BSD License. + */ +#ifndef SIMDBITCOMPAT_H_ +#define SIMDBITCOMPAT_H_ + +#include /* mostly for Microsoft compilers */ +#include + +#ifdef SIMDCOMP_DEBUG +#define SIMDCOMP_ALWAYS_INLINE inline +#define SIMDCOMP_NEVER_INLINE +#define SIMDCOMP_PURE +#else +#if defined(__GNUC__) +#if __GNUC__ >= 3 +#define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline)) +#define SIMDCOMP_NEVER_INLINE __attribute__((noinline)) +#define SIMDCOMP_PURE __attribute__((pure)) +#else +#define SIMDCOMP_ALWAYS_INLINE inline +#define SIMDCOMP_NEVER_INLINE +#define SIMDCOMP_PURE +#endif +#elif defined(_MSC_VER) +#define SIMDCOMP_ALWAYS_INLINE __forceinline +#define SIMDCOMP_NEVER_INLINE +#define SIMDCOMP_PURE +#else +#if __has_attribute(always_inline) +#define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline)) +#else +#define SIMDCOMP_ALWAYS_INLINE inline +#endif +#if __has_attribute(noinline) +#define SIMDCOMP_NEVER_INLINE __attribute__((noinline)) +#else +#define SIMDCOMP_NEVER_INLINE +#endif +#if __has_attribute(pure) +#define SIMDCOMP_PURE __attribute__((pure)) +#else +#define SIMDCOMP_PURE +#endif +#endif +#endif + +#if defined(_MSC_VER) && _MSC_VER < 1600 +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; +#else +#include /* part of Visual Studio 2010 and better, others likely anyway */ +#endif + +#if defined(_MSC_VER) +#define SIMDCOMP_ALIGNED(x) __declspec(align(x)) +#else +#if defined(__GNUC__) +#define SIMDCOMP_ALIGNED(x) __attribute__((aligned(x))) +#endif +#endif + +#if defined(_MSC_VER) +#include +/* 64-bit needs extending */ +#define SIMDCOMP_CTZ(result, mask) \ + do { \ + unsigned long index; \ + if (!_BitScanForward(&(index), (mask))) { \ + (result) = 32U; \ + } else { \ + (result) = (uint32_t)(index); \ + } \ + } while (0) +#else +#include +#define SIMDCOMP_CTZ(result, mask) result = __builtin_ctz(mask) +#endif + +#endif /* SIMDBITCOMPAT_H_ */ diff --git a/include/simdbitpacking.h b/include/simdbitpacking.h index 301f4f5..52f04de 100644 --- a/include/simdbitpacking.h +++ b/include/simdbitpacking.h @@ -4,18 +4,72 @@ #ifndef SIMDBITPACKING_H_ #define SIMDBITPACKING_H_ -#include // SSE2 is required -#include // use a C99-compliant compiler, please -#include // for memset +#include "portability.h" -//reads 128 values from "in", writes "bit" 128-bit vectors to "out" -void simdpack(const uint32_t * in,__m128i * out, uint32_t bit); +/* SSE2 is required */ +#include +/* for memset 
*/ +#include -//reads 128 values from "in", writes "bit" 128-bit vectors to "out" -void simdpackwithoutmask(const uint32_t * in,__m128i * out, uint32_t bit); +#include "simdcomputil.h" -//reads "bit" 128-bit vectors from "in", writes 128 values to "out" -void simdunpack(const __m128i * in,uint32_t * out, uint32_t bit); +/*** + * Please see example.c for various examples on how to make good use + * of these functions. + */ + +/* reads 128 values from "in", writes "bit" 128-bit vectors to "out". + * The input values are masked so that only the least significant "bit" bits are + * used. */ +void simdpack(const uint32_t *in, __m128i *out, const uint32_t bit); + +/* reads 128 values from "in", writes "bit" 128-bit vectors to "out". + * The input values are assumed to be less than 1<// SSE2 is required -#include // use a C99-compliant compiler, please +#include "portability.h" +/* SSE2 is required */ +#include +/* returns the integer logarithm of v (bit width) */ +uint32_t bits(const uint32_t v); +/* max integer logarithm over a range of SIMDBlockSize integers (128 integer) */ +uint32_t maxbits(const uint32_t *begin); -// returns the integer logarithm of v (bit width) -uint32_t bits(const uint32_t v); +/* same as maxbits, but we specify the number of integers */ +uint32_t maxbits_length(const uint32_t *in, uint32_t length); + +enum { SIMDBlockSize = 128 }; + +/* computes (quickly) the minimal value of 128 values */ +uint32_t simdmin(const uint32_t *in); -// max integer logarithm over a range -uint32_t maxbits(const uint32_t * begin); +/* computes (quickly) the minimal value of the specified number of values */ +uint32_t simdmin_length(const uint32_t *in, uint32_t length); -enum{ SIMDBlockSize = 128}; +#ifdef __SSE4_1__ +/* computes (quickly) the minimal and maximal value of the specified number of + * values */ +void simdmaxmin_length(const uint32_t *in, uint32_t length, uint32_t *getmin, + uint32_t *getmax); -// like maxbit over 128 integers (SIMDBlockSize) with provided initial value -// and using differential coding -uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in); +/* computes (quickly) the minimal and maximal value of the 128 values */ +void simdmaxmin(const uint32_t *in, uint32_t *getmin, uint32_t *getmax); +#endif +/* like maxbit over 128 integers (SIMDBlockSize) with provided initial value + and using differential coding */ +uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t *in); +/* like simdmaxbitsd1, but calculates maxbits over |length| integers + with provided initial value. |length| can be any arbitrary value. */ +uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t *in, + uint32_t length); #endif /* SIMDCOMPUTIL_H_ */ diff --git a/include/simdfor.h b/include/simdfor.h new file mode 100644 index 0000000..74642f5 --- /dev/null +++ b/include/simdfor.h @@ -0,0 +1,72 @@ +/** + * This code is released under a BSD License. 
+ */ +#ifndef INCLUDE_SIMDFOR_H_ +#define INCLUDE_SIMDFOR_H_ + +#include "portability.h" + +/* SSE2 is required */ +#include + +#include "simdbitpacking.h" +#include "simdcomputil.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* reads 128 values from "in", writes "bit" 128-bit vectors to "out" */ +void simdpackFOR(uint32_t initvalue, const uint32_t *in, __m128i *out, + const uint32_t bit); + +/* reads "bit" 128-bit vectors from "in", writes 128 values to "out" */ +void simdunpackFOR(uint32_t initvalue, const __m128i *in, uint32_t *out, + const uint32_t bit); + +/* how many compressed bytes are needed to compressed length integers using a +bit width of bit with the simdpackFOR_length function. */ +int simdpackFOR_compressedbytes(int length, const uint32_t bit); + +/* like simdpackFOR, but supports an undetermined number of inputs. +This is useful if you need to pack less than 128 integers. Note that this +function is much slower. Compressed data is stored in the memory location +between the provided (out) pointer and the returned pointer. */ +__m128i *simdpackFOR_length(uint32_t initvalue, const uint32_t *in, int length, + __m128i *out, const uint32_t bit); + +/* like simdunpackFOR, but supports an undetermined number of inputs. +This is useful if you need to unpack less than 128 integers. Note that this +function is much slower. The read compressed data is between the provided (in) +pointer and the returned pointer. */ +const __m128i *simdunpackFOR_length(uint32_t initvalue, const __m128i *in, + int length, uint32_t *out, + const uint32_t bit); + +/* returns the value stored at the specified "slot". + * */ +uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, + int slot); + +/* given a block of 128 packed values, this function sets the value at index + * "index" to "value" */ +void simdfastsetFOR(uint32_t initvalue, __m128i *in, uint32_t bit, + uint32_t value, size_t index); + +/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for + * the first encoded uint32 value which is >= |key|, and returns its position. + * It is assumed that the values stored are in sorted order. The encoded key is + * stored in "*presult". The first length decoded integers, ignoring others. If + * no value is larger or equal to the key, length is returned. Length should be + * no larger than 128. + * + * If no value is larger or equal to the key, + * length is returned */ +int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, + int length, uint32_t key, uint32_t *presult); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* INCLUDE_SIMDFOR_H_ */ diff --git a/include/simdintegratedbitpacking.h b/include/simdintegratedbitpacking.h index 18ca795..d8f3f2f 100644 --- a/include/simdintegratedbitpacking.h +++ b/include/simdintegratedbitpacking.h @@ -5,23 +5,92 @@ #ifndef SIMD_INTEGRATED_BITPACKING_H #define SIMD_INTEGRATED_BITPACKING_H -#include // SSE2 is required -#include // use a C99-compliant compiler, please +#include "portability.h" +/* SSE2 is required */ +#include + +#include "simdbitpacking.h" #include "simdcomputil.h" -//reads 128 values from "in", writes "bit" 128-bit vectors to "out" -// integer values should be in sorted order (for best results) -void simdpackd1(uint32_t initvalue, const uint32_t * in,__m128i * out, uint32_t bit); +#ifdef __cplusplus +extern "C" { +#endif + +/* reads 128 values from "in", writes "bit" 128-bit vectors to "out" + integer values should be in sorted order (for best results). 
+ The differences are masked so that only the least significant "bit" bits are + used. */ +void simdpackd1(uint32_t initvalue, const uint32_t *in, __m128i *out, + const uint32_t bit); +/* reads 128 values from "in", writes "bit" 128-bit vectors to "out" + integer values should be in sorted order (for best results). + The difference values are assumed to be less than 1<= |key|, and returns its position. It is + *assumed that the values stored are in sorted order. The encoded key is stored + *in "*presult". If no value is larger or equal to the key, 128 is returned. The + *pointer initOffset is a pointer to the last four value decoded (when starting + *out, this can be a zero vector or initialized with _mm_set1_epi32(init)), and + *the vector gets updated. + **/ +int simdsearchd1(__m128i *initOffset, const __m128i *in, uint32_t bit, + uint32_t key, uint32_t *presult); -//reads "bit" 128-bit vectors from "in", writes 128 values to "out" -void simdunpackd1(uint32_t initvalue, const __m128i * in,uint32_t * out, uint32_t bit); +/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for + * the first encoded uint32 value which is >= |key|, and returns its position. + * It is assumed that the values stored are in sorted order. The encoded key is + * stored in "*presult". The first length decoded integers, ignoring others. If + * no value is larger or equal to the key, length is returned. Length should be + * no larger than 128. + * + * If no value is larger or equal to the key, + * length is returned */ +int simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit, + int length, uint32_t key, uint32_t *presult); + +/* returns the value stored at the specified "slot". + * */ +uint32_t simdselectd1(uint32_t initvalue, const __m128i *in, uint32_t bit, + int slot); + +/* given a block of 128 packed values, this function sets the value at index + * "index" to "value", you must somehow know the previous value. Because of + * differential coding, all following values are incremented by the offset + * between this new value and the old value... This functions is useful if you + * want to modify the last value. + */ +void simdfastsetd1fromprevious(__m128i *in, uint32_t bit, + uint32_t previousvalue, uint32_t value, + size_t index); +/* given a block of 128 packed values, this function sets the value at index + * "index" to "value", This function computes the previous value if needed. + * Because of differential coding, all following values are incremented by the + * offset between this new value and the old value... This functions is useful + * if you want to modify the last value. + */ +void simdfastsetd1(uint32_t initvalue, __m128i *in, uint32_t bit, + uint32_t value, size_t index); + +/*Simply scan the data + * The pointer initOffset is a pointer to the last four value decoded + * (when starting out, this can be a zero vector or initialized with + * _mm_set1_epi32(init);), and the vector gets updated. 
+ * */ + +void simdscand1(__m128i *initOffset, const __m128i *in, uint32_t bit); + +#ifdef __cplusplus +} // extern "C" +#endif #endif diff --git a/makefile b/makefile deleted file mode 100644 index eefed83..0000000 --- a/makefile +++ /dev/null @@ -1,54 +0,0 @@ -# minimalist makefile -.SUFFIXES: -# -.SUFFIXES: .cpp .o .c .h - -CFLAGS = -fPIC -std=c99 -O3 -Wall -Wextra -Wno-unused-parameter -pedantic -LDFLAGS = -shared -LIBNAME=libsimdcomp.so.0.0.2 -all: unit $(LIBNAME) -test: - ./unit -install: $(OBJECTS) - cp $(LIBNAME) /usr/local/lib - ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so - ldconfig - cp $(HEADERS) /usr/local/include - - - -HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h - -uninstall: - for h in $(HEADERS) ; do rm /usr/local/$$h; done - rm /usr/local/lib/$(LIBNAME) - rm /usr/local/lib/libsimdcomp.so - ldconfig - - -OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o - -$(LIBNAME): $(OBJECTS) - $(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS) - - - -simdcomputil.o: ./src/simdcomputil.c $(HEADERS) - $(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude - -simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS) - $(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude - -simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS) - $(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude - -example: ./example.c $(HEADERS) $(OBJECTS) - $(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS) - -unit: ./src/unit.c $(HEADERS) $(OBJECTS) - $(CC) $(CFLAGS) -o unit ./src/unit.c -Iinclude $(OBJECTS) -dynunit: ./src/unit.c $(HEADERS) $(LIBNAME) - $(CC) $(CFLAGS) -o dynunit ./src/unit.c -Iinclude -lsimdcomp - -clean: - rm -f unit *.o $(LIBNAME) diff --git a/makefile.vc b/makefile.vc new file mode 100644 index 0000000..c298b45 --- /dev/null +++ b/makefile.vc @@ -0,0 +1,168 @@ + +!IFNDEF MACHINE +!IF "$(PROCESSOR_ARCHITECTURE)"=="AMD64" +MACHINE=x64 +!ELSE +MACHINE=x86 +!ENDIF +!ENDIF + +!IFNDEF VC +VC=vc%VisualStudioVersion:~0,-2% +!ENDIF + +# catch up when there's a stronger versioning +!IFNDEF PKG_VERSION +PKG_VERSION=latest +!ENDIF + +!IFNDEF DEBUG +DEBUG=no +!ENDIF + +!IFNDEF CC +CC=cl.exe +!ENDIF + +!IFNDEF AR +AR=lib.exe +!ENDIF + +!IFNDEF LINK +LINK=link.exe +!ENDIF + +!IFNDEF PGO +PGO=no +!ENDIF + +!IFNDEF PGI +PGI=no +!ENDIF + +INC = /Iinclude + +!IF "$(DEBUG)"=="yes" +CFLAGS = /nologo /MDd /LDd /Od /Zi /D_DEBUG /RTC1 /W3 /GS /Gm /D __SSE4_1__=1 +ARFLAGS = /nologo +LDFLAGS = /nologo /debug /nodefaultlib:msvcrt +!ELSE +CFLAGS = /nologo /MD /O2 /Zi /DNDEBUG /W3 /Gm- /GS /Gy /Oi /GL /MP /D __SSE4_1__=1 +ARFLAGS = /nologo /LTCG +LDFLAGS = /nologo /LTCG /DYNAMICBASE /incremental:no /debug /opt:ref,icf +!ENDIF + +!IF "$(PGI)"=="yes" +LDFLAGS = $(LDFLAGS) /ltcg:pgi +!ENDIF + +!IF "$(PGO)"=="yes" +LDFLAGS = $(LDFLAGS) /ltcg:pgo +!ENDIF + +# SSE4.1 is required +# VC++15.3 supports AVX512 +!IF "$(AVX512)"=="yes" +CFLAGS = $(CFLAGS) /arch:AVX2 /D __AVX2__=1 /D __AVX512F__=1 +AVX2=yes +!ELSEIF "$(AVX2)"=="yes" +CFLAGS = $(CFLAGS) /arch:AVX2 /D __AVX2__=1 +!ENDIF + +LIB_OBJS = simdbitpacking.obj simdintegratedbitpacking.obj simdcomputil.obj \ + simdpackedsearch.obj simdpackedselect.obj simdfor.obj + +LIB_SRCS = src/simdbitpacking.c src/simdintegratedbitpacking.c src/simdcomputil.c \ + src/simdpackedsearch.c src/simdpackedselect.c src/simdfor.c + +PKG_FEATURES=sse4.1 + +!IF "$(AVX2)"=="yes" +LIB_OBJS = $(LIB_OBJS) avxbitpacking.obj +LIB_SRCS = $(LIB_SRCS) src/avxbitpacking.c 
+PKG_FEATURES=avx2 +!ENDIF + +!IF "$(AVX512)"=="yes" +LIB_OBJS = $(LIB_OBJS) avx512bitpacking.obj +LIB_SRCS = $(LIB_SRCS) src/avx512bitpacking.c +PKG_FEATURES=avx512 +!ENDIF + + +all: lib dll dynunit unit_chars example benchmarks +# need some good use case scenario to train the instrumented build + @if "$(PGI)"=="yes" echo Running PGO training + @if "$(PGI)"=="yes" benchmark.exe >nul 2>&1 +# @if "$(PGI)"=="yes" bitpackingbenchmark.exe >nul 2>&1 + @if "$(PGI)"=="yes" example.exe >nul 2>&1 + + +$(LIB_OBJS): + $(CC) $(INC) $(CFLAGS) /c $(LIB_SRCS) + +lib: $(LIB_OBJS) + @copy simdcomp.def.tpl simdcomp.def + @if "$(AVX2)"=="yes" echo avxunpack >> simdcomp.def + @if "$(AVX2)"=="yes" echo avxpackwithoutmask >> simdcomp.def + @if "$(AVX2)"=="yes" echo avxpack >> simdcomp.def + @if "$(AVX2)"=="yes" echo avxmaxbits >> simdcomp.def + @if "$(AVX512)"=="yes" echo avx512unpack >> simdcomp.def + @if "$(AVX512)"=="yes" echo avx512packwithoutmask >> simdcomp.def + @if "$(AVX512)"=="yes" echo avx512pack >> simdcomp.def + @if "$(AVX512)"=="yes" echo avx512maxbits >> simdcomp.def + $(AR) $(ARFLAGS) /OUT:simdcomp_a.lib $(LIB_OBJS) + +dll: $(LIB_OBJS) + $(LINK) /DLL $(LDFLAGS) /OUT:simdcomp.dll /IMPLIB:simdcomp.lib /DEF:simdcomp.def $(LIB_OBJS) + +unit: lib + $(CC) $(INC) $(CFLAGS) /c tests/unit.c + $(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp_a.lib + +dynunit: dll + $(CC) $(INC) $(CFLAGS) /c tests/unit.c + $(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp.lib + +unit_chars: lib + $(CC) $(INC) $(CFLAGS) /c tests/unit_chars.c + $(LINK) $(LDFLAGS) /OUT:unit_chars.exe unit_chars.obj simdcomp.lib + + +example: lib + $(CC) $(INC) $(CFLAGS) /c example.c + $(LINK) $(LDFLAGS) /OUT:example.exe example.obj simdcomp.lib + +benchmarks: lib + $(CC) $(INC) $(CFLAGS) /c benchmarks/benchmark.c + $(LINK) $(LDFLAGS) /OUT:benchmark.exe benchmark.obj simdcomp.lib +# $(CC) $(INC) $(CFLAGS) /c benchmarks/bitpackingbenchmark.c +# $(LINK) $(LDFLAGS) /OUT:bitpackingbenchmark.exe bitpackingbenchmark.obj simdcomp.lib + +pack: + mkdir .\package + cd .\package + mkdir .\include + mkdir .\bin + mkdir .\lib + copy ..\include\*.h .\include + copy ..\simdcomp.dll .\bin + copy ..\simdcomp.pdb .\bin + copy ..\simdcomp.lib .\lib + copy ..\simdcomp_a.lib .\lib + copy ..\LICENSE . + copy ..\README.md . + 7z a ..\simdcomp-$(PKG_VERSION)-$(PKG_FEATURES)-$(VC)-$(MACHINE).zip . + cd .. 
+ powershell -Command "Remove-Item -Recurse -Force .\package" + +clean: + powershell -Command "Remove-Item -Force *.obj" + powershell -Command "Remove-Item -Force *.lib" + powershell -Command "Remove-Item -Force *.exe" + powershell -Command "Remove-Item -Force *.dll" + powershell -Command "Remove-Item -Force *.pgc" + powershell -Command "Remove-Item -Force *.pgd" + powershell -Command "Remove-Item -Force *.pdb" + powershell -Command "Remove-Item -Force *.def" + diff --git a/package.json b/package.json new file mode 100644 index 0000000..a91dd24 --- /dev/null +++ b/package.json @@ -0,0 +1,16 @@ +{ + "name": "simdcomp", + "version": "0.0.3", + "repo": "lemire/simdcomp", + "description": "A simple C library for compressing lists of integers", + "license": "BSD-3-Clause", + "src": [ + "src/simdbitpacking.c", + "src/simdcomputil.c", + "src/simdintegratedbitpacking.c", + "include/simdbitpacking.h", + "include/simdcomp.h", + "include/simdcomputil.h", + "include/simdintegratedbitpacking.h" + ] +} diff --git a/scripts/avx512packing.py b/scripts/avx512packing.py new file mode 100755 index 0000000..b7c0b52 --- /dev/null +++ b/scripts/avx512packing.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python +import sys +def howmany(bit): + """ how many values are we going to pack? """ + return 512 + +def howmanywords(bit): + return (howmany(bit) * bit + 511)/512 + +def howmanybytes(bit): + return howmanywords(bit) * 32 + +print(""" +/** avx512packing **/ +""") + +print("""typedef void (*avx512packblockfnc)(const uint32_t * pin, __m512i * compressed);""") +print("""typedef void (*avx512unpackblockfnc)(const __m512i * compressed, uint32_t * pout);""") + + + + + + +def plurial(number): + if(number <> 1): + return "s" + else : + return "" + +print("") +print("static void avx512packblock0(const uint32_t * pin, __m512i * compressed) {"); +print(" (void)compressed;"); +print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0)))); +print("}"); +print("") + +for bit in range(1,33): + print("") + print("/* we are going to pack {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) + print("static void avx512packblock{0}(const uint32_t * pin, __m512i * compressed) {{".format(bit)); + print(" const __m512i * in = (const __m512i *) pin;"); + print(" /* we are going to touch {0} 512-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); + if(howmanywords(bit) == 1): + print(" __m512i w0;") + else: + print(" __m512i w0, w1;") + if( (bit & (bit-1)) <> 0) : print(" __m512i tmp; /* used to store inputs at word boundary */") + oldword = 0 + for j in range(howmany(bit)/16): + firstword = j * bit / 32 + if(firstword > oldword): + print(" _mm512_storeu_si512(compressed + {0}, w{1});".format(oldword,oldword%2)) + oldword = firstword + secondword = (j * bit + bit - 1)/32 + firstshift = (j*bit) % 32 + if( firstword == secondword): + if(firstshift == 0): + print(" w{0} = _mm512_loadu_si512 (in + {1});".format(firstword%2,j)) + else: + print(" w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32(_mm512_loadu_si512 (in + {1}) , {2}));".format(firstword%2,j,firstshift)) + else: + print(" tmp = _mm512_loadu_si512 (in + {0});".format(j)) + print(" w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift)) + secondshift = 32-firstshift + print(" w{0} = _mm512_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift)) + print(" _mm512_storeu_si512(compressed + {0}, 
w{1});".format(secondword,secondword%2)) + print("}"); + print("") + + +print("") +print("static void avx512packblockmask0(const uint32_t * pin, __m512i * compressed) {"); +print(" (void)compressed;"); +print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0)))); +print("}"); +print("") + +for bit in range(1,33): + print("") + print("/* we are going to pack {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) + print("static void avx512packblockmask{0}(const uint32_t * pin, __m512i * compressed) {{".format(bit)); + print(" /* we are going to touch {0} 512-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); + if(howmanywords(bit) == 1): + print(" __m512i w0;") + else: + print(" __m512i w0, w1;") + print(" const __m512i * in = (const __m512i *) pin;"); + if(bit < 32): print(" const __m512i mask = _mm512_set1_epi32({0});".format((1< 0) : print(" __m512i tmp; /* used to store inputs at word boundary */") + oldword = 0 + for j in range(howmany(bit)/16): + firstword = j * bit / 32 + if(firstword > oldword): + print(" _mm512_storeu_si512(compressed + {0}, w{1});".format(oldword,oldword%2)) + oldword = firstword + secondword = (j * bit + bit - 1)/32 + firstshift = (j*bit) % 32 + loadstr = maskfnc(" _mm512_loadu_si512 (in + {0}) ".format(j)) + if( firstword == secondword): + if(firstshift == 0): + print(" w{0} = {1};".format(firstword%2,loadstr)) + else: + print(" w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift)) + else: + print(" tmp = {0};".format(loadstr)) + print(" w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift)) + secondshift = 32-firstshift + print(" w{0} = _mm512_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift)) + print(" _mm512_storeu_si512(compressed + {0}, w{1});".format(secondword,secondword%2)) + print("}"); + print("") + + +print("static void avx512unpackblock0(const __m512i * compressed, uint32_t * pout) {"); +print(" (void) compressed;"); +print(" memset(pout,0,{0});".format(howmany(0))); +print("}"); +print("") + +for bit in range(1,33): + print("") + print("/* we packed {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) + print("static void avx512unpackblock{0}(const __m512i * compressed, uint32_t * pout) {{".format(bit)); + print(" /* we are going to access {0} 512-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); + if(howmanywords(bit) == 1): + print(" __m512i w0;") + else: + print(" __m512i w0, w1;") + print(" __m512i * out = (__m512i *) pout;"); + if(bit < 32): print(" const __m512i mask = _mm512_set1_epi32({0});".format((1< oldword): + print(" w{0} = _mm512_loadu_si512 (compressed + {1});".format(secondword%2,secondword)) + oldword = secondword + firstshift = (j*bit) % 32 + firstshiftstr = "_mm512_srli_epi32( w{0} , "+str(firstshift)+") " + if(firstshift == 0): + firstshiftstr =" w{0} " # no need + wfirst = firstshiftstr.format(firstword%2) + if( firstword == secondword): + if(firstshift + bit <> 32): + wfirst = maskstr.format(wfirst) + print(" _mm512_storeu_si512(out + {0}, {1});".format(j,wfirst)) + else: + secondshift = (32-firstshift) + wsecond = "_mm512_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift) + wfirstorsecond = " _mm512_or_si512 ({0},{1}) ".format(wfirst,wsecond) + wfirstorsecond = 
maskstr.format(wfirstorsecond) + print(" _mm512_storeu_si512(out + {0},\n {1});".format(j,wfirstorsecond)) + print("}"); + print("") + + +print("static avx512packblockfnc avx512funcPackArr[] = {") +for bit in range(0,32): + print("&avx512packblock{0},".format(bit)) +print("&avx512packblock32") +print("};") + +print("static avx512packblockfnc avx512funcPackMaskArr[] = {") +for bit in range(0,32): + print("&avx512packblockmask{0},".format(bit)) +print("&avx512packblockmask32") +print("};") + + +print("static avx512unpackblockfnc avx512funcUnpackArr[] = {") +for bit in range(0,32): + print("&avx512unpackblock{0},".format(bit)) +print("&avx512unpackblock32") +print("};") +print("/** avx512packing **/") diff --git a/scripts/avxpacking.py b/scripts/avxpacking.py new file mode 100755 index 0000000..fccb69b --- /dev/null +++ b/scripts/avxpacking.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python +import sys +def howmany(bit): + """ how many values are we going to pack? """ + return 256 + +def howmanywords(bit): + return (howmany(bit) * bit + 255)/256 + +def howmanybytes(bit): + return howmanywords(bit) * 16 + +print(""" +/** avxpacking **/ +""") + +print("""typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);""") +print("""typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);""") + + + + + + +def plurial(number): + if(number <> 1): + return "s" + else : + return "" + +print("") +print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {"); +print(" (void)compressed;"); +print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0)))); +print("}"); +print("") + +for bit in range(1,33): + print("") + print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) + print("static void avxpackblock{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit)); + print(" const __m256i * in = (const __m256i *) pin;"); + print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); + if(howmanywords(bit) == 1): + print(" __m256i w0;") + else: + print(" __m256i w0, w1;") + if( (bit & (bit-1)) <> 0) : print(" __m256i tmp; /* used to store inputs at word boundary */") + oldword = 0 + for j in range(howmany(bit)/8): + firstword = j * bit / 32 + if(firstword > oldword): + print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2)) + oldword = firstword + secondword = (j * bit + bit - 1)/32 + firstshift = (j*bit) % 32 + if( firstword == secondword): + if(firstshift == 0): + print(" w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword%2,j)) + else: + print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword%2,j,firstshift)) + else: + print(" tmp = _mm256_lddqu_si256 (in + {0});".format(j)) + print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift)) + secondshift = 32-firstshift + print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift)) + print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2)) + print("}"); + print("") + + +print("") +print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {"); +print(" (void)compressed;"); +print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0)))); +print("}"); +print("") + +for bit in 
range(1,33): + print("") + print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) + print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit)); + print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); + if(howmanywords(bit) == 1): + print(" __m256i w0;") + else: + print(" __m256i w0, w1;") + print(" const __m256i * in = (const __m256i *) pin;"); + if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1< 0) : print(" __m256i tmp; /* used to store inputs at word boundary */") + oldword = 0 + for j in range(howmany(bit)/8): + firstword = j * bit / 32 + if(firstword > oldword): + print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2)) + oldword = firstword + secondword = (j * bit + bit - 1)/32 + firstshift = (j*bit) % 32 + loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j)) + if( firstword == secondword): + if(firstshift == 0): + print(" w{0} = {1};".format(firstword%2,loadstr)) + else: + print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift)) + else: + print(" tmp = {0};".format(loadstr)) + print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift)) + secondshift = 32-firstshift + print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift)) + print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2)) + print("}"); + print("") + + +print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {"); +print(" (void) compressed;"); +print(" memset(pout,0,{0});".format(howmany(0))); +print("}"); +print("") + +for bit in range(1,33): + print("") + print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) + print("static void avxunpackblock{0}(const __m256i * compressed, uint32_t * pout) {{".format(bit)); + print(" /* we are going to access {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); + if(howmanywords(bit) == 1): + print(" __m256i w0;") + else: + print(" __m256i w0, w1;") + print(" __m256i * out = (__m256i *) pout;"); + if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1< oldword): + print(" w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword)) + oldword = secondword + firstshift = (j*bit) % 32 + firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") " + if(firstshift == 0): + firstshiftstr =" w{0} " # no need + wfirst = firstshiftstr.format(firstword%2) + if( firstword == secondword): + if(firstshift + bit <> 32): + wfirst = maskstr.format(wfirst) + print(" _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst)) + else: + secondshift = (32-firstshift) + wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift) + wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond) + wfirstorsecond = maskstr.format(wfirstorsecond) + print(" _mm256_storeu_si256(out + {0},\n {1});".format(j,wfirstorsecond)) + print("}"); + print("") + + +print("static avxpackblockfnc avxfuncPackArr[] = {") +for bit in range(0,32): + print("&avxpackblock{0},".format(bit)) +print("&avxpackblock32") +print("};") + +print("static avxpackblockfnc avxfuncPackMaskArr[] = {") +for bit in range(0,32): + 
print("&avxpackblockmask{0},".format(bit)) +print("&avxpackblockmask32") +print("};") + + +print("static avxunpackblockfnc avxfuncUnpackArr[] = {") +for bit in range(0,32): + print("&avxunpackblock{0},".format(bit)) +print("&avxunpackblock32") +print("};") +print("/** avxpacking **/") diff --git a/scripts/simdfor.py b/scripts/simdfor.py new file mode 100755 index 0000000..c60db1e --- /dev/null +++ b/scripts/simdfor.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 + + +from math import ceil + +print(""" +/** +* Blablabla +* +*/ + +"""); + +def mask(bit): + return str((1 << bit) - 1) + +for length in [32]: + print(""" +static __m128i iunpackFOR0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { + __m128i *out = (__m128i*)(_out); + int i; + (void) _in; + for (i = 0; i < 8; ++i) { + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + } + + return initOffset; +} + + """) + print(""" + +static void ipackFOR0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { + (void) initOffset; + (void) _in; + (void) out; +} +""") + for bit in range(1,33): + offsetVar = " initOffset"; + print(""" +static void ipackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + """); + + if (bit != 32): + print(" __m128i CurrIn = _mm_load_si128(in);"); + print(" __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);"); + else: + print(" __m128i InReg = _mm_load_si128(in);"); + print(" (void) initOffset;"); + + + inwordpointer = 0 + valuecounter = 0 + for k in range(ceil((length * bit) / 32)): + if(valuecounter == length): break + for x in range(inwordpointer,32,bit): + if(x!=0) : + print(" OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, " + str(x) + "));"); + else: + print(" OutReg = InReg; "); + if((x+bit>=32) ): + while(inwordpointer<32): + inwordpointer += bit + print(" _mm_store_si128(out, OutReg);"); + print(""); + + if(valuecounter + 1 < length): + print(" ++out;") + inwordpointer -= 32; + if(inwordpointer>0): + print(" OutReg = _mm_srli_epi32(InReg, " + str(bit) + " - " + str(inwordpointer) + ");"); + if(valuecounter + 1 < length): + print(" ++in;") + + if (bit != 32): + print(" CurrIn = _mm_load_si128(in);"); + print(" InReg = _mm_sub_epi32(CurrIn, initOffset);"); + else: + print(" InReg = _mm_load_si128(in);"); + print(""); + valuecounter = valuecounter + 1 + if(valuecounter == length): break + assert(valuecounter == length) + print("\n}\n\n""") + + for bit in range(1,32): + offsetVar = " initOffset"; + print("""\n +static __m128i iunpackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const __m128i* in, uint32_t * _out) { + """); + print(""" __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<"""+str(bit)+""")-1); + + """); + + MainText = ""; + + MainText += "\n"; + inwordpointer = 0 + valuecounter = 0 + for k in range(ceil((length * bit) / 32)): + for x in range(inwordpointer,32,bit): + if(valuecounter == length): break + if (x > 0): + MainText += " tmp = _mm_srli_epi32(InReg," + str(x) +");\n"; + else: + MainText += " tmp = InReg;\n"; + if(x+bit<32): + MainText += " OutReg = _mm_and_si128(tmp, mask);\n"; + else: + MainText += " OutReg = tmp;\n"; + if((x+bit>=32) ): + while(inwordpointer<32): + inwordpointer += bit + if(valuecounter + 1 < length): + MainText += " ++in;" + MainText += " InReg = 
_mm_load_si128(in);\n"; + inwordpointer -= 32; + if(inwordpointer>0): + MainText += " OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, " + str(bit) + "-" + str(inwordpointer) + "), mask));\n\n"; + if (bit != 32): + MainText += " OutReg = _mm_add_epi32(OutReg, initOffset);\n"; + MainText += " _mm_store_si128(out++, OutReg);\n\n"; + MainText += ""; + valuecounter = valuecounter + 1 + if(valuecounter == length): break + assert(valuecounter == length) + print(MainText) + print(" return initOffset;"); + print("\n}\n\n") + print(""" +static __m128i iunpackFOR32(__m128i initvalue , const __m128i* in, uint32_t * _out) { + __m128i * mout = (__m128i *)_out; + __m128i invec; + size_t k; + for(k = 0; k < 128/4; ++k) { + invec = _mm_load_si128(in++); + _mm_store_si128(mout++, invec); + } + return invec; +} + """) diff --git a/simdcomp.def.tpl b/simdcomp.def.tpl new file mode 100644 index 0000000..87d7582 --- /dev/null +++ b/simdcomp.def.tpl @@ -0,0 +1,41 @@ +EXPORTS + simdpack + simdpackwithoutmask + simdunpack + bits + maxbits + maxbits_length + simdmin + simdmin_length + simdmaxmin + simdmaxmin_length + simdmaxbitsd1 + simdmaxbitsd1_length + simdpackd1 + simdpackwithoutmaskd1 + simdunpackd1 + simdsearchd1 + simdsearchwithlengthd1 + simdselectd1 + simdpackFOR + simdselectFOR + simdsearchwithlengthFOR + simdunpackFOR + simdmin_length + simdmaxmin + simdmaxmin_length + simdpack_length + simdpackFOR_length + simdunpackFOR_length + simdpackFOR_compressedbytes + simdpack_shortlength + simdpack_compressedbytes + simdfastsetFOR + simdfastset + simdfastsetd1 + simdunpack_length + simdunpack_shortlength + simdsearchwithlengthFOR + simdscand1 + simdfastsetd1fromprevious + diff --git a/src/avx512bitpacking.c b/src/avx512bitpacking.c new file mode 100644 index 0000000..ac93ae6 --- /dev/null +++ b/src/avx512bitpacking.c @@ -0,0 +1,9932 @@ +#include "avx512bitpacking.h" +#ifdef __AVX512F__ + +static uint32_t maxbitas32int(const __m256i accumulator) { + const __m256i _tmp1 = + _mm256_or_si256(_mm256_srli_si256(accumulator, 8), accumulator); + const __m256i _tmp2 = _mm256_or_si256(_mm256_srli_si256(_tmp1, 4), _tmp1); + uint32_t ans1 = _mm256_extract_epi32(_tmp2, 0); + uint32_t ans2 = _mm256_extract_epi32(_tmp2, 4); + uint32_t ans = ans1 > ans2 ? ans1 : ans2; + return ans; +} + +static uint32_t avx512maxbitas32int(const __m512i accumulator) { + uint32_t ans1 = maxbitas32int(_mm512_castsi512_si256(accumulator)); + uint32_t ans2 = maxbitas32int(_mm512_extracti64x4_epi64(accumulator, 1)); + uint32_t ans = ans1 > ans2 ? 
ans1 : ans2; + return bits(ans); +} + +uint32_t avx512maxbits(const uint32_t *begin) { + const __m512i *pin = (const __m512i *)(begin); + __m512i accumulator = _mm512_loadu_si512(pin); + uint32_t k = 1; + for (; 16 * k < AVX512BlockSize; ++k) { + __m512i newvec = _mm512_loadu_si512(pin + k); + accumulator = _mm512_or_si512(accumulator, newvec); + } + return avx512maxbitas32int(accumulator); +} + +/** avx512packing **/ + +typedef void (*avx512packblockfnc)(const uint32_t *pin, __m512i *compressed); +typedef void (*avx512unpackblockfnc)(const __m512i *compressed, uint32_t *pout); + +static void avx512packblock0(const uint32_t *pin, __m512i *compressed) { + (void)compressed; + (void)pin; /* we consumed 512 32-bit integers */ +} + +/* we are going to pack 512 1-bit values, touching 1 512-bit words, using 32 + * bytes */ +static void avx512packblock1(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 1 512-bit word */ + __m512i w0; + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 7)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 9)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 15)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 16), 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 17)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 19)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 21)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 22)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 23)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 25)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 26)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 27)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 28)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 29)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 30)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 31)); + _mm512_storeu_si512(compressed + 0, w0); +} + +/* 
we are going to pack 512 2-bit values, touching 2 512-bit words, using 64 + * bytes */ +static void avx512packblock2(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 2 512-bit words */ + __m512i w0, w1; + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 22)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 26)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 28)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 30)); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_loadu_si512(in + 16); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 20)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 22)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 24)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 26)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 28)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 30)); + _mm512_storeu_si512(compressed + 1, w1); +} + +/* we are going to pack 512 3-bit values, touching 3 512-bit words, using 96 + * bytes */ +static void avx512packblock3(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 3 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 9)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 12)); + w0 = _mm512_or_si512(w0, 
_mm512_slli_epi32(_mm512_loadu_si512(in + 5), 15)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 21)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 27)); + tmp = _mm512_loadu_si512(in + 10); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 1)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 7)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 13)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 16), 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 19)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 22)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 25)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 28)); + tmp = _mm512_loadu_si512(in + 21); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 17)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 23)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 26)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 29)); + _mm512_storeu_si512(compressed + 2, w0); +} + +/* we are going to pack 512 4-bit values, touching 4 512-bit words, using 128 + * bytes */ +static void avx512packblock4(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 4 512-bit words */ + __m512i w0, w1; + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 28)); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_loadu_si512(in + 8); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 12)); + w1 = _mm512_or_si512(w1, 
_mm512_slli_epi32(_mm512_loadu_si512(in + 12), 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 20)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 24)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 28)); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_loadu_si512(in + 16); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 28)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_loadu_si512(in + 24); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 20)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 24)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 28)); + _mm512_storeu_si512(compressed + 3, w1); +} + +/* we are going to pack 512 5-bit values, touching 5 512-bit words, using 160 + * bytes */ +static void avx512packblock5(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 5 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 15)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 25)); + tmp = _mm512_loadu_si512(in + 6); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 3)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 13)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 23)); + tmp = _mm512_loadu_si512(in + 12); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 16), 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 21)); + w0 = _mm512_or_si512(w0, 
_mm512_slli_epi32(_mm512_loadu_si512(in + 18), 26)); + tmp = _mm512_loadu_si512(in + 19); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 31)); + w1 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 9)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 19)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 24)); + tmp = _mm512_loadu_si512(in + 25); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 7)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 17)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 22)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 27)); + _mm512_storeu_si512(compressed + 4, w0); +} + +/* we are going to pack 512 6-bit values, touching 6 512-bit words, using 192 + * bytes */ +static void avx512packblock6(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 6 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 24)); + tmp = _mm512_loadu_si512(in + 5); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 22)); + tmp = _mm512_loadu_si512(in + 10); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 26)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_loadu_si512(in + 16); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 24)); + tmp = _mm512_loadu_si512(in + 21); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = 
_mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 22)); + tmp = _mm512_loadu_si512(in + 26); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 20)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 26)); + _mm512_storeu_si512(compressed + 5, w1); +} + +/* we are going to pack 512 7-bit values, touching 7 512-bit words, using 224 + * bytes */ +static void avx512packblock7(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 7 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 7)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 21)); + tmp = _mm512_loadu_si512(in + 4); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 3)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 17)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 24)); + tmp = _mm512_loadu_si512(in + 9); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 20)); + tmp = _mm512_loadu_si512(in + 13); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 9)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 16), 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 23)); + tmp = _mm512_loadu_si512(in + 18); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 19)); + tmp = _mm512_loadu_si512(in + 22); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + 
_mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 1)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 15)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 22)); + tmp = _mm512_loadu_si512(in + 27); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 25)); + _mm512_storeu_si512(compressed + 6, w0); +} + +/* we are going to pack 512 8-bit values, touching 8 512-bit words, using 256 + * bytes */ +static void avx512packblock8(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 8 512-bit words */ + __m512i w0, w1; + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 24)); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_loadu_si512(in + 4); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 24)); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_loadu_si512(in + 8); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 24)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_loadu_si512(in + 12); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 24)); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_loadu_si512(in + 16); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 24)); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_loadu_si512(in + 20); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 24)); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_loadu_si512(in + 24); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 24)); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_loadu_si512(in + 28); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 8)); + w1 = _mm512_or_si512(w1, 
_mm512_slli_epi32(_mm512_loadu_si512(in + 30), 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 24)); + _mm512_storeu_si512(compressed + 7, w1); +} + +/* we are going to pack 512 9-bit values, touching 9 512-bit words, using 288 + * bytes */ +static void avx512packblock9(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 9 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 9)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 18)); + tmp = _mm512_loadu_si512(in + 3); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 13)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 22)); + tmp = _mm512_loadu_si512(in + 7); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 17)); + tmp = _mm512_loadu_si512(in + 10); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 3)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 21)); + tmp = _mm512_loadu_si512(in + 14); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 7)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 16), 16)); + tmp = _mm512_loadu_si512(in + 17); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 11)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 20)); + tmp = _mm512_loadu_si512(in + 21); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 15)); + tmp = _mm512_loadu_si512(in + 24); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 1)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 19)); + tmp = _mm512_loadu_si512(in + 28); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, 
_mm512_slli_epi32(_mm512_loadu_si512(in + 29), 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 23)); + _mm512_storeu_si512(compressed + 8, w0); +} + +/* we are going to pack 512 10-bit values, touching 10 512-bit words, using 320 + * bytes */ +static void avx512packblock10(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 10 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 20)); + tmp = _mm512_loadu_si512(in + 3); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 18)); + tmp = _mm512_loadu_si512(in + 6); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 16)); + tmp = _mm512_loadu_si512(in + 9); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 14)); + tmp = _mm512_loadu_si512(in + 12); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 22)); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_loadu_si512(in + 16); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 20)); + tmp = _mm512_loadu_si512(in + 19); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 18)); + tmp = _mm512_loadu_si512(in + 22); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 16)); + tmp = _mm512_loadu_si512(in + 25); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 14)); + tmp = _mm512_loadu_si512(in + 28); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + 
_mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 22)); + _mm512_storeu_si512(compressed + 9, w1); +} + +/* we are going to pack 512 11-bit values, touching 11 512-bit words, using 352 + * bytes */ +static void avx512packblock11(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 11 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 11)); + tmp = _mm512_loadu_si512(in + 2); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 1)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 12)); + tmp = _mm512_loadu_si512(in + 5); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 13)); + tmp = _mm512_loadu_si512(in + 8); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 3)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 14)); + tmp = _mm512_loadu_si512(in + 11); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 25)); + w0 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 15)); + tmp = _mm512_loadu_si512(in + 14); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 5)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 16), 16)); + tmp = _mm512_loadu_si512(in + 17); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 27)); + w0 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 17)); + tmp = _mm512_loadu_si512(in + 20); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 7)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 18)); + tmp = _mm512_loadu_si512(in + 23); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 19)); + tmp = _mm512_loadu_si512(in + 26); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + 
_mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 9)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 20)); + tmp = _mm512_loadu_si512(in + 29); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 21)); + _mm512_storeu_si512(compressed + 10, w0); +} + +/* we are going to pack 512 12-bit values, touching 12 512-bit words, using 384 + * bytes */ +static void avx512packblock12(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 12 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 12)); + tmp = _mm512_loadu_si512(in + 2); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 16)); + tmp = _mm512_loadu_si512(in + 5); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 20)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_loadu_si512(in + 8); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 12)); + tmp = _mm512_loadu_si512(in + 10); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 16)); + tmp = _mm512_loadu_si512(in + 13); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 20)); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_loadu_si512(in + 16); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 12)); + tmp = _mm512_loadu_si512(in + 18); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 16)); + tmp = _mm512_loadu_si512(in + 21); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 20)); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_loadu_si512(in + 24); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 12)); + tmp = _mm512_loadu_si512(in + 26); + w1 = _mm512_or_si512(w1, 
_mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 16)); + tmp = _mm512_loadu_si512(in + 29); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 20)); + _mm512_storeu_si512(compressed + 11, w1); +} + +/* we are going to pack 512 13-bit values, touching 13 512-bit words, using 416 + * bytes */ +static void avx512packblock13(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 13 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 13)); + tmp = _mm512_loadu_si512(in + 2); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 7)); + tmp = _mm512_loadu_si512(in + 4); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 14)); + tmp = _mm512_loadu_si512(in + 7); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 8)); + tmp = _mm512_loadu_si512(in + 9); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 21)); + w0 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 15)); + tmp = _mm512_loadu_si512(in + 12); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 9)); + tmp = _mm512_loadu_si512(in + 14); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 16), 16)); + tmp = _mm512_loadu_si512(in + 17); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 10)); + tmp = _mm512_loadu_si512(in + 19); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 17)); + tmp = _mm512_loadu_si512(in + 22); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + 
_mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 11)); + tmp = _mm512_loadu_si512(in + 24); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 18)); + tmp = _mm512_loadu_si512(in + 27); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 31)); + w1 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 12)); + tmp = _mm512_loadu_si512(in + 29); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 25)); + w0 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 19)); + _mm512_storeu_si512(compressed + 12, w0); +} + +/* we are going to pack 512 14-bit values, touching 14 512-bit words, using 448 + * bytes */ +static void avx512packblock14(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 14 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 14)); + tmp = _mm512_loadu_si512(in + 2); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 10)); + tmp = _mm512_loadu_si512(in + 4); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 6)); + tmp = _mm512_loadu_si512(in + 6); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 16)); + tmp = _mm512_loadu_si512(in + 9); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 12)); + tmp = _mm512_loadu_si512(in + 11); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 8)); + tmp = _mm512_loadu_si512(in + 13); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 18)); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_loadu_si512(in + 16); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 14)); + tmp = _mm512_loadu_si512(in + 18); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, 
_mm512_slli_epi32(_mm512_loadu_si512(in + 19), 10)); + tmp = _mm512_loadu_si512(in + 20); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 6)); + tmp = _mm512_loadu_si512(in + 22); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 16)); + tmp = _mm512_loadu_si512(in + 25); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 12)); + tmp = _mm512_loadu_si512(in + 27); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 8)); + tmp = _mm512_loadu_si512(in + 29); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 18)); + _mm512_storeu_si512(compressed + 13, w1); +} + +/* we are going to pack 512 15-bit values, touching 15 512-bit words, using 480 + * bytes */ +static void avx512packblock15(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 15 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 15)); + tmp = _mm512_loadu_si512(in + 2); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 13)); + tmp = _mm512_loadu_si512(in + 4); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 11)); + tmp = _mm512_loadu_si512(in + 6); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 9)); + tmp = _mm512_loadu_si512(in + 8); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 7)); + tmp = _mm512_loadu_si512(in + 10); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 5)); + tmp = _mm512_loadu_si512(in + 12); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 3)); + tmp = _mm512_loadu_si512(in + 14); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = 
_mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 1)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 16), 16)); + tmp = _mm512_loadu_si512(in + 17); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 14)); + tmp = _mm512_loadu_si512(in + 19); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 12)); + tmp = _mm512_loadu_si512(in + 21); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 27)); + w0 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 10)); + tmp = _mm512_loadu_si512(in + 23); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 8)); + tmp = _mm512_loadu_si512(in + 25); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 6)); + tmp = _mm512_loadu_si512(in + 27); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 4)); + tmp = _mm512_loadu_si512(in + 29); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 19)); + w0 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 30), 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 17)); + _mm512_storeu_si512(compressed + 14, w0); +} + +/* we are going to pack 512 16-bit values, touching 16 512-bit words, using 512 + * bytes */ +static void avx512packblock16(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 16 512-bit words */ + __m512i w0, w1; + w0 = _mm512_loadu_si512(in + 0); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 1), 16)); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 16)); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_loadu_si512(in + 4); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 16)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_loadu_si512(in + 6); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 16)); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_loadu_si512(in + 8); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 16)); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_loadu_si512(in + 10); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 16)); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_loadu_si512(in + 12); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 16)); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_loadu_si512(in + 14); + w1 = _mm512_or_si512(w1, 
_mm512_slli_epi32(_mm512_loadu_si512(in + 15), 16)); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_loadu_si512(in + 16); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 16)); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_loadu_si512(in + 18); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 16)); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_loadu_si512(in + 20); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 16)); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_loadu_si512(in + 22); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 16)); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_loadu_si512(in + 24); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 16)); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_loadu_si512(in + 26); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 16)); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_loadu_si512(in + 28); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 16)); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_loadu_si512(in + 30); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 16)); + _mm512_storeu_si512(compressed + 15, w1); +} + +/* we are going to pack 512 17-bit values, touching 17 512-bit words, using 544 + * bytes */ +static void avx512packblock17(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 17 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 2)); + tmp = _mm512_loadu_si512(in + 3); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 19)); + w0 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 4)); + tmp = _mm512_loadu_si512(in + 5); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 6)); + tmp = _mm512_loadu_si512(in + 7); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 8)); + tmp = _mm512_loadu_si512(in + 9); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 10)); + tmp = _mm512_loadu_si512(in + 11); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 27)); + w0 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 12)); + tmp = _mm512_loadu_si512(in + 13); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 14)); + tmp = _mm512_loadu_si512(in + 15); + w1 = _mm512_or_si512(w1, 
_mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_loadu_si512(in + 16); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 1)); + tmp = _mm512_loadu_si512(in + 18); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 3)); + tmp = _mm512_loadu_si512(in + 20); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 5)); + tmp = _mm512_loadu_si512(in + 22); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 7)); + tmp = _mm512_loadu_si512(in + 24); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 9)); + tmp = _mm512_loadu_si512(in + 26); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 11)); + tmp = _mm512_loadu_si512(in + 28); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 13)); + tmp = _mm512_loadu_si512(in + 30); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 15)); + _mm512_storeu_si512(compressed + 16, w0); +} + +/* we are going to pack 512 18-bit values, touching 18 512-bit words, using 576 + * bytes */ +static void avx512packblock18(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 18 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 4)); + tmp = _mm512_loadu_si512(in + 3); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 8)); + tmp = _mm512_loadu_si512(in + 5); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 12)); + tmp = _mm512_loadu_si512(in + 7); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_loadu_si512(in + 8); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = 
_mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 2)); + tmp = _mm512_loadu_si512(in + 10); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 6)); + tmp = _mm512_loadu_si512(in + 12); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 10)); + tmp = _mm512_loadu_si512(in + 14); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 14)); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_loadu_si512(in + 16); + tmp = _mm512_loadu_si512(in + 17); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 4)); + tmp = _mm512_loadu_si512(in + 19); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 8)); + tmp = _mm512_loadu_si512(in + 21); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 12)); + tmp = _mm512_loadu_si512(in + 23); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_loadu_si512(in + 24); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 2)); + tmp = _mm512_loadu_si512(in + 26); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 6)); + tmp = _mm512_loadu_si512(in + 28); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 10)); + tmp = _mm512_loadu_si512(in + 30); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 14)); + _mm512_storeu_si512(compressed + 17, w1); +} + +/* we are going to pack 512 19-bit values, touching 19 512-bit words, using 608 + * bytes */ +static void avx512packblock19(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 19 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 19)); + w1 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, 
_mm512_slli_epi32(_mm512_loadu_si512(in + 2), 6)); + tmp = _mm512_loadu_si512(in + 3); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 25)); + w0 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 12)); + tmp = _mm512_loadu_si512(in + 5); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 31)); + w1 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_loadu_si512(in + 6); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 5)); + tmp = _mm512_loadu_si512(in + 8); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 11)); + tmp = _mm512_loadu_si512(in + 10); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_loadu_si512(in + 11); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 4)); + tmp = _mm512_loadu_si512(in + 13); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 10)); + tmp = _mm512_loadu_si512(in + 15); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_loadu_si512(in + 16); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 3)); + tmp = _mm512_loadu_si512(in + 18); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 9)); + tmp = _mm512_loadu_si512(in + 20); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_loadu_si512(in + 21); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 2)); + tmp = _mm512_loadu_si512(in + 23); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 21)); + w0 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 8)); + tmp = _mm512_loadu_si512(in + 25); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_loadu_si512(in + 26); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 1)); + tmp = _mm512_loadu_si512(in + 28); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + 
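+  /* the 19-bit values loaded from in + 28 split across output words 16 and 17: 12 bits top off w0, and the shift right by 12 seeds w1 with the remaining 7 bits */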
_mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 7)); + tmp = _mm512_loadu_si512(in + 30); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 17, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 13)); + _mm512_storeu_si512(compressed + 18, w0); +} + +/* we are going to pack 512 20-bit values, touching 20 512-bit words, using 640 + * bytes */ +static void avx512packblock20(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 20 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 8)); + tmp = _mm512_loadu_si512(in + 3); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_loadu_si512(in + 4); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 4)); + tmp = _mm512_loadu_si512(in + 6); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 12)); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_loadu_si512(in + 8); + tmp = _mm512_loadu_si512(in + 9); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 8)); + tmp = _mm512_loadu_si512(in + 11); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_loadu_si512(in + 12); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 4)); + tmp = _mm512_loadu_si512(in + 14); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 12)); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_loadu_si512(in + 16); + tmp = _mm512_loadu_si512(in + 17); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 8)); + tmp = _mm512_loadu_si512(in + 19); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_loadu_si512(in + 20); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 4)); + tmp = _mm512_loadu_si512(in + 22); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = 
_mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 12)); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_loadu_si512(in + 24); + tmp = _mm512_loadu_si512(in + 25); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 8)); + tmp = _mm512_loadu_si512(in + 27); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_loadu_si512(in + 28); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 17, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 4)); + tmp = _mm512_loadu_si512(in + 30); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 18, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 12)); + _mm512_storeu_si512(compressed + 19, w1); +} + +/* we are going to pack 512 21-bit values, touching 21 512-bit words, using 672 + * bytes */ +static void avx512packblock21(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 21 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 2), 10)); + tmp = _mm512_loadu_si512(in + 3); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_loadu_si512(in + 4); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 9)); + tmp = _mm512_loadu_si512(in + 6); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_loadu_si512(in + 7); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 19)); + w1 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 8), 8)); + tmp = _mm512_loadu_si512(in + 9); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_loadu_si512(in + 10); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 7)); + tmp = _mm512_loadu_si512(in + 12); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_loadu_si512(in + 13); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 6)); + tmp = _mm512_loadu_si512(in + 15); + w1 = 
_mm512_or_si512(w1, _mm512_slli_epi32(tmp, 27)); + w0 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_loadu_si512(in + 16); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 5)); + tmp = _mm512_loadu_si512(in + 18); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_loadu_si512(in + 19); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 20), 4)); + tmp = _mm512_loadu_si512(in + 21); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 25)); + w0 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_loadu_si512(in + 22); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 14)); + w1 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 3)); + tmp = _mm512_loadu_si512(in + 24); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_loadu_si512(in + 25); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 13)); + w1 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 2)); + tmp = _mm512_loadu_si512(in + 27); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_loadu_si512(in + 28); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 18, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 29), 1)); + tmp = _mm512_loadu_si512(in + 30); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 19, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 11)); + _mm512_storeu_si512(compressed + 20, w0); +} + +/* we are going to pack 512 22-bit values, touching 22 512-bit words, using 704 + * bytes */ +static void avx512packblock22(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 22 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 2)); + tmp = _mm512_loadu_si512(in + 4); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_loadu_si512(in + 5); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, 
_mm512_slli_epi32(_mm512_loadu_si512(in + 6), 4)); + tmp = _mm512_loadu_si512(in + 7); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_loadu_si512(in + 8); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 6)); + tmp = _mm512_loadu_si512(in + 10); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_loadu_si512(in + 11); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 8)); + tmp = _mm512_loadu_si512(in + 13); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_loadu_si512(in + 14); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 10)); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_loadu_si512(in + 16); + tmp = _mm512_loadu_si512(in + 17); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_loadu_si512(in + 18); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 2)); + tmp = _mm512_loadu_si512(in + 20); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_loadu_si512(in + 21); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 14)); + w1 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 4)); + tmp = _mm512_loadu_si512(in + 23); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_loadu_si512(in + 24); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 6)); + tmp = _mm512_loadu_si512(in + 26); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_loadu_si512(in + 27); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 18, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 8)); + tmp = _mm512_loadu_si512(in + 29); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_loadu_si512(in + 30); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 10)); + _mm512_storeu_si512(compressed + 21, w1); +} + +/* 
we are going to pack 512 23-bit values, touching 23 512-bit words, using 736 + * bytes */ +static void avx512packblock23(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 23 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 23)); + w1 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 5)); + tmp = _mm512_loadu_si512(in + 4); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_loadu_si512(in + 5); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 19)); + w0 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_loadu_si512(in + 6); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 10)); + w1 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 1)); + tmp = _mm512_loadu_si512(in + 8); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_loadu_si512(in + 9); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 6)); + tmp = _mm512_loadu_si512(in + 11); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_loadu_si512(in + 12); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_loadu_si512(in + 13); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 11)); + w0 = _mm512_srli_epi32(tmp, 21); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 14), 2)); + tmp = _mm512_loadu_si512(in + 15); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_loadu_si512(in + 16); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 17), 7)); + tmp = _mm512_loadu_si512(in + 18); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_loadu_si512(in + 19); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 21)); + w0 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_loadu_si512(in + 20); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 3)); + tmp = _mm512_loadu_si512(in + 22); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + 
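+  /* the 23-bit values loaded from in + 22 place their low 6 bits at the top of output word 15 (w1); the shift right by 6 starts word 16 (w0) with their upper 17 bits */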
_mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_loadu_si512(in + 23); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 24), 8)); + tmp = _mm512_loadu_si512(in + 25); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_loadu_si512(in + 26); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_loadu_si512(in + 27); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 13)); + w0 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 19, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 28), 4)); + tmp = _mm512_loadu_si512(in + 29); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 20, w0); + tmp = _mm512_loadu_si512(in + 30); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 21, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 9)); + _mm512_storeu_si512(compressed + 22, w0); +} + +/* we are going to pack 512 24-bit values, touching 24 512-bit words, using 768 + * bytes */ +static void avx512packblock24(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 24 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 3), 8)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_loadu_si512(in + 4); + tmp = _mm512_loadu_si512(in + 5); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_loadu_si512(in + 6); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 8)); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_loadu_si512(in + 8); + tmp = _mm512_loadu_si512(in + 9); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_loadu_si512(in + 10); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 11), 8)); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_loadu_si512(in + 12); + tmp = _mm512_loadu_si512(in + 13); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_loadu_si512(in + 14); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 10, 
w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 8)); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_loadu_si512(in + 16); + tmp = _mm512_loadu_si512(in + 17); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_loadu_si512(in + 18); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 8)); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_loadu_si512(in + 20); + tmp = _mm512_loadu_si512(in + 21); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_loadu_si512(in + 22); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 8)); + _mm512_storeu_si512(compressed + 17, w1); + w0 = _mm512_loadu_si512(in + 24); + tmp = _mm512_loadu_si512(in + 25); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_loadu_si512(in + 26); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 19, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 8)); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_loadu_si512(in + 28); + tmp = _mm512_loadu_si512(in + 29); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_loadu_si512(in + 30); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 22, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 8)); + _mm512_storeu_si512(compressed + 23, w1); +} + +/* we are going to pack 512 25-bit values, touching 25 512-bit words, using 800 + * bytes */ +static void avx512packblock25(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 25 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_loadu_si512(in + 3); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 11)); + w1 = _mm512_srli_epi32(tmp, 21); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 4), 4)); + tmp = _mm512_loadu_si512(in + 5); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_loadu_si512(in + 6); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_loadu_si512(in + 7); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 15)); + w0 = 
_mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_loadu_si512(in + 8); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 9), 1)); + tmp = _mm512_loadu_si512(in + 10); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_loadu_si512(in + 11); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 19)); + w1 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_loadu_si512(in + 12); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 13), 5)); + tmp = _mm512_loadu_si512(in + 14); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_loadu_si512(in + 15); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_loadu_si512(in + 16); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_loadu_si512(in + 17); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 9)); + w0 = _mm512_srli_epi32(tmp, 23); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 18), 2)); + tmp = _mm512_loadu_si512(in + 19); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_loadu_si512(in + 20); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_loadu_si512(in + 21); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 13)); + w1 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 22), 6)); + tmp = _mm512_loadu_si512(in + 23); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_loadu_si512(in + 24); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_loadu_si512(in + 25); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 17)); + w0 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_loadu_si512(in + 26); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 10)); + w1 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 27), 3)); + tmp = _mm512_loadu_si512(in + 28); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_loadu_si512(in + 29); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_loadu_si512(in + 30); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 23, w1); + w0 = 
_mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 7)); + _mm512_storeu_si512(compressed + 24, w0); +} + +/* we are going to pack 512 26-bit values, touching 26 512-bit words, using 832 + * bytes */ +static void avx512packblock26(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 26 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_loadu_si512(in + 3); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 14)); + w1 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_loadu_si512(in + 4); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 5), 2)); + tmp = _mm512_loadu_si512(in + 6); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_loadu_si512(in + 7); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_loadu_si512(in + 8); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_loadu_si512(in + 9); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 10)); + w0 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 4)); + tmp = _mm512_loadu_si512(in + 11); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_loadu_si512(in + 12); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_loadu_si512(in + 13); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_loadu_si512(in + 14); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 6)); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_loadu_si512(in + 16); + tmp = _mm512_loadu_si512(in + 17); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_loadu_si512(in + 18); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_loadu_si512(in + 19); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_loadu_si512(in + 20); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 16, w0); + w1 = 
_mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 2)); + tmp = _mm512_loadu_si512(in + 22); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_loadu_si512(in + 23); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_loadu_si512(in + 24); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_loadu_si512(in + 25); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 10)); + w1 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 26), 4)); + tmp = _mm512_loadu_si512(in + 27); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_loadu_si512(in + 28); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_loadu_si512(in + 29); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_loadu_si512(in + 30); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 24, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 6)); + _mm512_storeu_si512(compressed + 25, w1); +} + +/* we are going to pack 512 27-bit values, touching 27 512-bit words, using 864 + * bytes */ +static void avx512packblock27(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 27 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_loadu_si512(in + 3); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_loadu_si512(in + 4); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_loadu_si512(in + 5); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 7)); + w1 = _mm512_srli_epi32(tmp, 25); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 6), 2)); + tmp = _mm512_loadu_si512(in + 7); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_loadu_si512(in + 8); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_loadu_si512(in + 9); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 19)); + w0 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_loadu_si512(in + 10); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 14)); + 
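+  /* the 27-bit values loaded from in + 10 start at bit 14 of output word 8, so 18 bits fit here; the shift right below moves their top 9 bits into word 9 */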
w1 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_loadu_si512(in + 11); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 9)); + w0 = _mm512_srli_epi32(tmp, 23); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 12), 4)); + tmp = _mm512_loadu_si512(in + 13); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 31)); + w1 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_loadu_si512(in + 14); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_loadu_si512(in + 15); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_loadu_si512(in + 16); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_loadu_si512(in + 17); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 11)); + w1 = _mm512_srli_epi32(tmp, 21); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_loadu_si512(in + 18); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 6)); + w0 = _mm512_srli_epi32(tmp, 26); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 19), 1)); + tmp = _mm512_loadu_si512(in + 20); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_loadu_si512(in + 21); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_loadu_si512(in + 22); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_loadu_si512(in + 23); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 13)); + w0 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_loadu_si512(in + 24); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 25), 3)); + tmp = _mm512_loadu_si512(in + 26); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_loadu_si512(in + 27); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_loadu_si512(in + 28); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_loadu_si512(in + 29); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 24, w0); + tmp = _mm512_loadu_si512(in + 30); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 10)); + w0 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 25, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 5)); + _mm512_storeu_si512(compressed + 26, w0); +} + +/* we are going to pack 512 28-bit values, touching 28 512-bit words, using 896 + * bytes */ +static void avx512packblock28(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const 
__m512i *)pin; + /* we are going to touch 28 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_loadu_si512(in + 3); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_loadu_si512(in + 4); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_loadu_si512(in + 5); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_loadu_si512(in + 6); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 7), 4)); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_loadu_si512(in + 8); + tmp = _mm512_loadu_si512(in + 9); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_loadu_si512(in + 10); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_loadu_si512(in + 11); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_loadu_si512(in + 12); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_loadu_si512(in + 13); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_loadu_si512(in + 14); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 4)); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_loadu_si512(in + 16); + tmp = _mm512_loadu_si512(in + 17); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_loadu_si512(in + 18); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_loadu_si512(in + 19); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_loadu_si512(in + 20); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_loadu_si512(in + 21); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_loadu_si512(in + 22); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 19, 
w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 23), 4)); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_loadu_si512(in + 24); + tmp = _mm512_loadu_si512(in + 25); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_loadu_si512(in + 26); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_loadu_si512(in + 27); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_loadu_si512(in + 28); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 24, w0); + tmp = _mm512_loadu_si512(in + 29); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 25, w1); + tmp = _mm512_loadu_si512(in + 30); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 26, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 4)); + _mm512_storeu_si512(compressed + 27, w1); +} + +/* we are going to pack 512 29-bit values, touching 29 512-bit words, using 928 + * bytes */ +static void avx512packblock29(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 29 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_loadu_si512(in + 3); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 23)); + w1 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_loadu_si512(in + 4); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_loadu_si512(in + 5); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_loadu_si512(in + 6); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_loadu_si512(in + 7); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 11)); + w1 = _mm512_srli_epi32(tmp, 21); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_loadu_si512(in + 8); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_loadu_si512(in + 9); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 5)); + w1 = _mm512_srli_epi32(tmp, 27); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 10), 2)); + tmp = _mm512_loadu_si512(in + 11); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_loadu_si512(in + 12); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 
28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_loadu_si512(in + 13); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 25)); + w0 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_loadu_si512(in + 14); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_loadu_si512(in + 15); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 19)); + w0 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_loadu_si512(in + 16); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_loadu_si512(in + 17); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 13)); + w0 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_loadu_si512(in + 18); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 10)); + w1 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_loadu_si512(in + 19); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 7)); + w0 = _mm512_srli_epi32(tmp, 25); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_loadu_si512(in + 20); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 4)); + w1 = _mm512_srli_epi32(tmp, 28); + _mm512_storeu_si512(compressed + 18, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 21), 1)); + tmp = _mm512_loadu_si512(in + 22); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_loadu_si512(in + 23); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 20, w0); + tmp = _mm512_loadu_si512(in + 24); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_loadu_si512(in + 25); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_loadu_si512(in + 26); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_loadu_si512(in + 27); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 24, w0); + tmp = _mm512_loadu_si512(in + 28); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 25, w1); + tmp = _mm512_loadu_si512(in + 29); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 9)); + w1 = _mm512_srli_epi32(tmp, 23); + _mm512_storeu_si512(compressed + 26, w0); + tmp = _mm512_loadu_si512(in + 30); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 6)); + w0 = _mm512_srli_epi32(tmp, 26); + _mm512_storeu_si512(compressed + 27, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 3)); + _mm512_storeu_si512(compressed + 28, w0); +} + +/* we are going to pack 512 30-bit values, touching 30 512-bit words, using 960 + * bytes */ +static void avx512packblock30(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 30 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = 
_mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_loadu_si512(in + 3); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_loadu_si512(in + 4); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_loadu_si512(in + 5); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_loadu_si512(in + 6); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_loadu_si512(in + 7); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_loadu_si512(in + 8); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_loadu_si512(in + 9); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 14)); + w1 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_loadu_si512(in + 10); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_loadu_si512(in + 11); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 10)); + w1 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_loadu_si512(in + 12); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_loadu_si512(in + 13); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 6)); + w1 = _mm512_srli_epi32(tmp, 26); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_loadu_si512(in + 14); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 4)); + w0 = _mm512_srli_epi32(tmp, 28); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(_mm512_loadu_si512(in + 15), 2)); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_loadu_si512(in + 16); + tmp = _mm512_loadu_si512(in + 17); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_loadu_si512(in + 18); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_loadu_si512(in + 19); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_loadu_si512(in + 20); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_loadu_si512(in + 21); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_loadu_si512(in + 22); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + 
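+ /* w0 is now full in every lane: 20 bits carried over from the previous value plus
+  * the low 12 bits of the values loaded from in + 22; it is flushed as compressed
+  * word 20 below, while w1 carries the remaining 18 bits of this value. */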
_mm512_storeu_si512(compressed + 20, w0); + tmp = _mm512_loadu_si512(in + 23); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_loadu_si512(in + 24); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_loadu_si512(in + 25); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_loadu_si512(in + 26); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 24, w0); + tmp = _mm512_loadu_si512(in + 27); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 10)); + w0 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 25, w1); + tmp = _mm512_loadu_si512(in + 28); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 26, w0); + tmp = _mm512_loadu_si512(in + 29); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 6)); + w0 = _mm512_srli_epi32(tmp, 26); + _mm512_storeu_si512(compressed + 27, w1); + tmp = _mm512_loadu_si512(in + 30); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 4)); + w1 = _mm512_srli_epi32(tmp, 28); + _mm512_storeu_si512(compressed + 28, w0); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(_mm512_loadu_si512(in + 31), 2)); + _mm512_storeu_si512(compressed + 29, w1); +} + +/* we are going to pack 512 31-bit values, touching 31 512-bit words, using 992 + * bytes */ +static void avx512packblock31(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 31 512-bit words */ + __m512i w0, w1; + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_loadu_si512(in + 0); + tmp = _mm512_loadu_si512(in + 1); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 31)); + w1 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_loadu_si512(in + 2); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_loadu_si512(in + 3); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_loadu_si512(in + 4); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_loadu_si512(in + 5); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_loadu_si512(in + 6); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_loadu_si512(in + 7); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_loadu_si512(in + 8); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_loadu_si512(in + 9); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 23)); + w1 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_loadu_si512(in + 10); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = 
_mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_loadu_si512(in + 11); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_loadu_si512(in + 12); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_loadu_si512(in + 13); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 19)); + w1 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_loadu_si512(in + 14); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_loadu_si512(in + 15); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_loadu_si512(in + 16); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_loadu_si512(in + 17); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_loadu_si512(in + 18); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_loadu_si512(in + 19); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 13)); + w1 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_loadu_si512(in + 20); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_loadu_si512(in + 21); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 11)); + w1 = _mm512_srli_epi32(tmp, 21); + _mm512_storeu_si512(compressed + 20, w0); + tmp = _mm512_loadu_si512(in + 22); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 10)); + w0 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_loadu_si512(in + 23); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 9)); + w1 = _mm512_srli_epi32(tmp, 23); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_loadu_si512(in + 24); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_loadu_si512(in + 25); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 7)); + w1 = _mm512_srli_epi32(tmp, 25); + _mm512_storeu_si512(compressed + 24, w0); + tmp = _mm512_loadu_si512(in + 26); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 6)); + w0 = _mm512_srli_epi32(tmp, 26); + _mm512_storeu_si512(compressed + 25, w1); + tmp = _mm512_loadu_si512(in + 27); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 5)); + w1 = _mm512_srli_epi32(tmp, 27); + _mm512_storeu_si512(compressed + 26, w0); + tmp = _mm512_loadu_si512(in + 28); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 4)); + w0 = _mm512_srli_epi32(tmp, 28); + _mm512_storeu_si512(compressed + 27, w1); + tmp = _mm512_loadu_si512(in + 29); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 3)); + w1 = _mm512_srli_epi32(tmp, 29); + _mm512_storeu_si512(compressed + 28, w0); + tmp = _mm512_loadu_si512(in + 30); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 2)); + w0 = _mm512_srli_epi32(tmp, 30); + _mm512_storeu_si512(compressed + 29, w1); + w0 = _mm512_or_si512(w0, 
_mm512_slli_epi32(_mm512_loadu_si512(in + 31), 1)); + _mm512_storeu_si512(compressed + 30, w0); +} + +/* we are going to pack 512 32-bit values, touching 32 512-bit words, using 1024 + * bytes */ +static void avx512packblock32(const uint32_t *pin, __m512i *compressed) { + const __m512i *in = (const __m512i *)pin; + /* we are going to touch 32 512-bit words */ + __m512i w0, w1; + w0 = _mm512_loadu_si512(in + 0); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_loadu_si512(in + 1); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_loadu_si512(in + 2); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_loadu_si512(in + 3); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_loadu_si512(in + 4); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_loadu_si512(in + 5); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_loadu_si512(in + 6); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_loadu_si512(in + 7); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_loadu_si512(in + 8); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_loadu_si512(in + 9); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_loadu_si512(in + 10); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_loadu_si512(in + 11); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_loadu_si512(in + 12); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_loadu_si512(in + 13); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_loadu_si512(in + 14); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_loadu_si512(in + 15); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_loadu_si512(in + 16); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_loadu_si512(in + 17); + _mm512_storeu_si512(compressed + 17, w1); + w0 = _mm512_loadu_si512(in + 18); + _mm512_storeu_si512(compressed + 18, w0); + w1 = _mm512_loadu_si512(in + 19); + _mm512_storeu_si512(compressed + 19, w1); + w0 = _mm512_loadu_si512(in + 20); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_loadu_si512(in + 21); + _mm512_storeu_si512(compressed + 21, w1); + w0 = _mm512_loadu_si512(in + 22); + _mm512_storeu_si512(compressed + 22, w0); + w1 = _mm512_loadu_si512(in + 23); + _mm512_storeu_si512(compressed + 23, w1); + w0 = _mm512_loadu_si512(in + 24); + _mm512_storeu_si512(compressed + 24, w0); + w1 = _mm512_loadu_si512(in + 25); + _mm512_storeu_si512(compressed + 25, w1); + w0 = _mm512_loadu_si512(in + 26); + _mm512_storeu_si512(compressed + 26, w0); + w1 = _mm512_loadu_si512(in + 27); + _mm512_storeu_si512(compressed + 27, w1); + w0 = _mm512_loadu_si512(in + 28); + _mm512_storeu_si512(compressed + 28, w0); + w1 = _mm512_loadu_si512(in + 29); + _mm512_storeu_si512(compressed + 29, w1); + w0 = _mm512_loadu_si512(in + 30); + _mm512_storeu_si512(compressed + 30, w0); + w1 = _mm512_loadu_si512(in + 31); + _mm512_storeu_si512(compressed + 31, w1); +} + +static void avx512packblockmask0(const uint32_t *pin, __m512i *compressed) { + (void)compressed; + (void)pin; /* we consumed 512 32-bit integers */ +} + +/* we are going to pack 512 1-bit values, touching 1 512-bit words, using 32 + * bytes */ +static void avx512packblockmask1(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 1 512-bit word */ + __m512i w0; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 
1)), 1)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), 2)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), 3)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), 4)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), 5)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), 6)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), 7)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), 8)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), 9)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 10)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 11)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 12)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 13)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 14)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 15)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 16)), + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 17)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 18)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 19)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 20)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 21)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 22)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 23)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 24)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 25)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 26)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 27)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 28)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 29)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 30)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 31)); + _mm512_storeu_si512(compressed + 0, w0); +} + +/* we are going to pack 512 2-bit values, touching 2 512-bit words, using 64 + * bytes */ +static void avx512packblockmask2(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 2 512-bit words */ + __m512i w0, w1; + const 
__m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(3); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), 2)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), 4)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), 6)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), + 10)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), + 12)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), + 14)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), + 18)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 20)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 22)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 24)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 26)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 28)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 30)); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 2)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 4)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 6)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 10)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 12)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 14)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 18)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 20)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 22)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 24)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 26)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 28)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 30)); + _mm512_storeu_si512(compressed + 1, w1); +} + +/* we are 
going to pack 512 3-bit values, touching 3 512-bit words, using 96 + * bytes */ +static void avx512packblockmask3(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 3 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(7); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), 3)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), 6)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), 9)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), + 12)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), + 15)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), + 18)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), + 21)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), + 24)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), + 27)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 1)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 4)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 7)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 10)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 13)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 16)), + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 19)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 22)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 25)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 28)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 2)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 5)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 11)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 14)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 17)); + w0 = _mm512_or_si512( + w0, 
_mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 20)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 23)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 26)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 29)); + _mm512_storeu_si512(compressed + 2, w0); +} + +/* we are going to pack 512 4-bit values, touching 4 512-bit words, using 128 + * bytes */ +static void avx512packblockmask4(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 4 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(15); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), 4)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), + 12)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), + 20)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), + 24)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), + 28)); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), 4)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 12)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 20)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 24)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 28)); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 4)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 12)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 20)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 24)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 28)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 4)); + w1 = _mm512_or_si512( + 
w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 12)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 20)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 24)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 28)); + _mm512_storeu_si512(compressed + 3, w1); +} + +/* we are going to pack 512 5-bit values, touching 5 512-bit words, using 160 + * bytes */ +static void avx512packblockmask5(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 5 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(31); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), 5)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), + 10)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), + 15)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), + 20)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), + 25)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), 3)); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), + 13)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 18)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 23)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 1)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 6)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 11)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 16)), + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 21)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 26)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 31)); + w1 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 4)); + w1 = 
_mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 9)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 14)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 19)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 24)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 2)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 7)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 12)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 17)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 22)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 27)); + _mm512_storeu_si512(compressed + 4, w0); +} + +/* we are going to pack 512 6-bit values, touching 6 512-bit words, using 192 + * bytes */ +static void avx512packblockmask6(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 6 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(63); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), 6)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), + 12)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), + 18)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), + 24)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), 4)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), + 10)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), + 22)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 2)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 14)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 20)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 
26)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 6)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 12)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 18)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 24)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 4)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 10)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 22)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 2)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 14)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 20)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 26)); + _mm512_storeu_si512(compressed + 5, w1); +} + +/* we are going to pack 512 7-bit values, touching 7 512-bit words, using 224 + * bytes */ +static void avx512packblockmask7(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 7 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(127); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), 7)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), + 14)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), + 21)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), 3)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), + 10)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), + 17)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), + 24)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 1, w1); + w0 = 
_mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 6)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 13)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 20)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 2)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 9)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 16)), + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 23)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 5)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 12)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 19)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 1)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 15)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 22)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 4)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 11)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 18)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 25)); + _mm512_storeu_si512(compressed + 6, w0); +} + +/* we are going to pack 512 8-bit values, touching 8 512-bit words, using 256 + * bytes */ +static void avx512packblockmask8(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 8 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(255); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), + 24)); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_and_si512(mask, 
_mm512_loadu_si512(in + 4)); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), + 24)); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 24)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 24)); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 24)); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 24)); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 24)); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 24)); + _mm512_storeu_si512(compressed + 7, w1); +} + +/* we are going to pack 512 9-bit values, touching 9 512-bit words, using 288 + * bytes */ +static void avx512packblockmask9(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 9 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(511); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), 9)); + w0 = _mm512_or_si512( + w0, 
_mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), + 18)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), 4)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), + 13)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), + 22)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), + 17)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 3)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 12)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 21)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 7)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 16)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 2)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 11)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 20)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 6)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 15)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 1)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 10)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 19)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + 
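/* note (added explanation): in[28] holds 9-bit values that start at bit 28 of output word 7, so they straddle a word boundary: the slli above places their low 4 bits at the top of w1, while the srli carries their high 5 bits into w0, which seeds output word 8 */ +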
_mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 5)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 14)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 23)); + _mm512_storeu_si512(compressed + 8, w0); +} + +/* we are going to pack 512 10-bit values, touching 10 512-bit words, using 320 + * bytes */ +static void avx512packblockmask10(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 10 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(1023); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), + 10)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), + 20)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), + 18)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), 6)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 4)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 14)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 2)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 12)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 22)); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 10)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 20)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 
18)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 6)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 4)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 14)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 2)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 12)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 22)); + _mm512_storeu_si512(compressed + 9, w1); +} + +/* we are going to pack 512 11-bit values, touching 11 512-bit words, using 352 + * bytes */ +static void avx512packblockmask11(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 11 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(2047); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), + 11)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), 1)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), 2)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), + 13)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), 3)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 14)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 25)); + w0 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 4)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, 
_mm512_loadu_si512(in + 13)), + 15)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 5)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 16)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 27)); + w0 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 6)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 17)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 7)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 18)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 19)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 9)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 20)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 10)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 21)); + _mm512_storeu_si512(compressed + 10, w0); +} + +/* we are going to pack 512 12-bit values, touching 12 512-bit words, using 384 + * bytes */ +static void avx512packblockmask12(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 12 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(4095); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), 4)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), + 16)); + tmp = _mm512_and_si512(mask, 
_mm512_loadu_si512(in + 5)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), + 20)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 4)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 20)); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 4)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 20)); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 4)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 8)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 20)); + _mm512_storeu_si512(compressed + 11, w1); +} + +/* we are going to pack 
512 13-bit values, touching 13 512-bit words, using 416 + * bytes */ +static void avx512packblockmask13(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 13 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(8191); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), + 13)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), 7)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), 1)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), + 14)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 21)); + w0 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 2)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 15)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 9)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 3)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 16)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 10)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 4)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 17)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + 
_mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 11)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 5)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 18)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 31)); + w1 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 25)); + w0 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 6)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 19)); + _mm512_storeu_si512(compressed + 12, w0); +} + +/* we are going to pack 512 14-bit values, touching 14 512-bit words, using 448 + * bytes */ +static void avx512packblockmask14(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 14 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(16383); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), + 14)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), + 10)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), 2)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 
13)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 4)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 18)); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 14)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 10)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 2)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 4)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 18)); + _mm512_storeu_si512(compressed + 13, w1); +} + +/* we are going to pack 512 15-bit values, touching 15 512-bit words, using 480 + * bytes */ +static void avx512packblockmask15(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 15 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(32767); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), + 15)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), + 13)); + tmp = _mm512_and_si512(mask, 
_mm512_loadu_si512(in + 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), + 11)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), 9)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), 7)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 5)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 3)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 1)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 16)), + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 14)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 27)); + w0 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 10)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); 
+ _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 19)); + w0 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 30)), + 2)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 17)); + _mm512_storeu_si512(compressed + 14, w0); +} + +/* we are going to pack 512 16-bit values, touching 16 512-bit words, using 512 + * bytes */ +static void avx512packblockmask16(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 16 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(65535); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 1)), + 16)); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), + 16)); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), + 16)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), + 16)); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), + 16)); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 16)); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 16)); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 16)); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 16)); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 16)); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 16)); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 16)); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w0 = 
_mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 16)); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 16)); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 16)); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 16)); + _mm512_storeu_si512(compressed + 15, w1); +} + +/* we are going to pack 512 17-bit values, touching 17 512-bit words, using 544 + * bytes */ +static void avx512packblockmask17(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 17 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(131071); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 19)); + w0 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 10)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 27)); + w0 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 14)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 15)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 7, w1); + tmp = 
_mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 1)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 3)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 5)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 7)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 9)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 11)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 13)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 15)); + _mm512_storeu_si512(compressed + 16, w0); +} + +/* we are going to pack 512 18-bit values, touching 18 512-bit words, using 576 + * bytes */ +static void avx512packblockmask18(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 18 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(262143); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), 8)); + tmp = _mm512_and_si512(mask, 
_mm512_loadu_si512(in + 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 10)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 14)); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + 
w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 10)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 14)); + _mm512_storeu_si512(compressed + 17, w1); +} + +/* we are going to pack 512 19-bit values, touching 19 512-bit words, using 608 + * bytes */ +static void avx512packblockmask19(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 19 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(524287); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 19)); + w1 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 25)); + w0 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 31)); + w1 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), 5)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), + 11)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 10)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 15)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + 
w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 3)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 9)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 21)); + w0 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 1)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 7)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 17, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 13)); + _mm512_storeu_si512(compressed + 18, w0); +} + +/* we are going to pack 512 20-bit values, touching 20 512-bit words, using 640 + * bytes */ +static void avx512packblockmask20(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 20 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(1048575); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + 
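/* note (added explanation): each 32-bit lane is packed independently: lane k of the 32 input vectors forms its own bit stream in lane k of the output words, so the shifts apply to all 16 lanes at once; here the 20-bit values from in[4] sit at bit 16, leaving 16 bits in output word 2 and carrying 4 bits into word 3 */ +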
_mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), + 12)); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 12)); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 12)); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w1 = _mm512_or_si512(w1, 
_mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 17, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 18, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 12)); + _mm512_storeu_si512(compressed + 19, w1); +} + +/* we are going to pack 512 21-bit values, touching 21 512-bit words, using 672 + * bytes */ +static void avx512packblockmask21(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 21 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(2097151); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 2)), + 10)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), 9)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 19)); + w1 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 8)), 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 7)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 15)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 27)); + w0 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 
16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 5)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 20)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 25)); + w0 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 14)); + w1 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 3)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 13)); + w1 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 18, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 29)), + 1)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 19, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 11)); + _mm512_storeu_si512(compressed + 20, w0); +} + +/* we are going to pack 512 22-bit values, touching 22 512-bit words, using 704 + * bytes */ +static void avx512packblockmask22(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 22 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(4194303); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), 2)); + tmp = 
_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 10)); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 14)); + w1 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, 
_mm512_loadu_si512(in + 25)), + 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 18, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 10)); + _mm512_storeu_si512(compressed + 21, w1); +} + +/* we are going to pack 512 23-bit values, touching 23 512-bit words, using 736 + * bytes */ +static void avx512packblockmask23(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 23 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(8388607); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 23)); + w1 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), 5)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 19)); + w0 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 10)); + w1 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), 1)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 
12); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 11)); + w0 = _mm512_srli_epi32(tmp, 21); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 14)), + 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 15)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 17)), + 7)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 21)); + w0 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 3)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 24)), + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 13)); + w0 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 19, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 28)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 20, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 21, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 9)); + _mm512_storeu_si512(compressed + 22, w0); +} + +/* we are going to pack 512 24-bit values, touching 24 512-bit words, using 768 + * bytes */ +static void avx512packblockmask24(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 24 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const 
__m512i *)pin; + const __m512i mask = _mm512_set1_epi32(16777215); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 3)), 8)); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), 8)); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 11)), + 8)); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 8)); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 8)); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + 
_mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 8)); + _mm512_storeu_si512(compressed + 17, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 19, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 8)); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 22, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 8)); + _mm512_storeu_si512(compressed + 23, w1); +} + +/* we are going to pack 512 25-bit values, touching 25 512-bit words, using 800 + * bytes */ +static void avx512packblockmask25(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 25 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(33554431); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 11)); + w1 = _mm512_srli_epi32(tmp, 21); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 4)), 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 15)); + w0 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 9)), 1)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = 
_mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 19)); + w1 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 13)), + 5)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 15)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 9)); + w0 = _mm512_srli_epi32(tmp, 23); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 18)), + 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 13)); + w1 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 22)), + 6)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 17)); + w0 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 10)); + w1 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 27)), + 3)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 
= _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 23, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 7)); + _mm512_storeu_si512(compressed + 24, w0); +} + +/* we are going to pack 512 26-bit values, touching 26 512-bit words, using 832 + * bytes */ +static void avx512packblockmask26(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 26 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(67108863); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 14)); + w1 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 5)), 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 10)); + w0 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 6)); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 
26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 10)); + w1 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 26)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 24, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 6)); + _mm512_storeu_si512(compressed + 25, w1); +} + +/* we are going to pack 512 27-bit values, touching 27 512-bit words, using 864 + * bytes */ +static void avx512packblockmask27(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 27 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(134217727); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w0 = _mm512_or_si512(w0, 
_mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 7)); + w1 = _mm512_srli_epi32(tmp, 25); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_or_si512( + w1, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 6)), 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 29)); + w0 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 19)); + w0 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 14)); + w1 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 9)); + w0 = _mm512_srli_epi32(tmp, 23); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 12)), + 4)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 31)); + w1 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 15)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 11)); + w1 = _mm512_srli_epi32(tmp, 21); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 6)); + w0 = _mm512_srli_epi32(tmp, 26); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 19)), + 1)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 23)); + w0 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 18, w0); + tmp = 
_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 13)); + w0 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 25)), + 3)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 24, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 10)); + w0 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 25, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 5)); + _mm512_storeu_si512(compressed + 26, w0); +} + +/* we are going to pack 512 28-bit values, touching 28 512-bit words, using 896 + * bytes */ +static void avx512packblockmask28(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 28 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(268435455); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_or_si512( + w0, + _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 7)), 4)); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w1 = _mm512_or_si512(w1, 
_mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 4)); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 19, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 23)), + 4)); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 24, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w1 = _mm512_or_si512(w1, 
_mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 25, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 26, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 4)); + _mm512_storeu_si512(compressed + 27, w1); +} + +/* we are going to pack 512 29-bit values, touching 29 512-bit words, using 928 + * bytes */ +static void avx512packblockmask29(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 29 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(536870911); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 23)); + w1 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 11)); + w1 = _mm512_srli_epi32(tmp, 21); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 5)); + w1 = _mm512_srli_epi32(tmp, 27); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 10)), + 2)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 31)); + w0 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 25)); + w0 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 12, w0); + tmp = 
_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 19)); + w0 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 13)); + w0 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 10)); + w1 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 7)); + w0 = _mm512_srli_epi32(tmp, 25); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 4)); + w1 = _mm512_srli_epi32(tmp, 28); + _mm512_storeu_si512(compressed + 18, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 21)), + 1)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 20, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 24, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 25, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 9)); + w1 = _mm512_srli_epi32(tmp, 23); + _mm512_storeu_si512(compressed + 26, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 6)); + w0 = _mm512_srli_epi32(tmp, 26); + _mm512_storeu_si512(compressed + 27, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 3)); + _mm512_storeu_si512(compressed + 28, w0); +} + +/* we are going to pack 512 30-bit values, touching 30 512-bit words, using 960 + * bytes */ +static void avx512packblockmask30(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 30 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(1073741823); + __m512i tmp; /* used to 
store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 30)); + w1 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 26)); + w1 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 22)); + w1 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 18)); + w1 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 14)); + w1 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 10)); + w1 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 6)); + w1 = _mm512_srli_epi32(tmp, 26); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 4)); + w0 = _mm512_srli_epi32(tmp, 28); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 15)), + 2)); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 28)); + w1 = _mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = 
_mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 24)); + w1 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 20)); + w1 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 20, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 16)); + w1 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 12)); + w1 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 24, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 10)); + w0 = _mm512_srli_epi32(tmp, 22); + _mm512_storeu_si512(compressed + 25, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 8)); + w1 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 26, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 6)); + w0 = _mm512_srli_epi32(tmp, 26); + _mm512_storeu_si512(compressed + 27, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 4)); + w1 = _mm512_srli_epi32(tmp, 28); + _mm512_storeu_si512(compressed + 28, w0); + w1 = _mm512_or_si512( + w1, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 2)); + _mm512_storeu_si512(compressed + 29, w1); +} + +/* we are going to pack 512 31-bit values, touching 31 512-bit words, using 992 + * bytes */ +static void avx512packblockmask31(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 31 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + const __m512i mask = _mm512_set1_epi32(2147483647); + __m512i tmp; /* used to store inputs at word boundary */ + w0 = _mm512_and_si512(mask, _mm512_loadu_si512(in + 0)); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 1)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 31)); + w1 = _mm512_srli_epi32(tmp, 1); + _mm512_storeu_si512(compressed + 0, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 2)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 30)); + w0 = _mm512_srli_epi32(tmp, 2); + _mm512_storeu_si512(compressed + 1, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 3)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 29)); + w1 = _mm512_srli_epi32(tmp, 3); + _mm512_storeu_si512(compressed + 2, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 4)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 28)); + w0 = 
_mm512_srli_epi32(tmp, 4); + _mm512_storeu_si512(compressed + 3, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 5)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 27)); + w1 = _mm512_srli_epi32(tmp, 5); + _mm512_storeu_si512(compressed + 4, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 6)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 26)); + w0 = _mm512_srli_epi32(tmp, 6); + _mm512_storeu_si512(compressed + 5, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 7)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 25)); + w1 = _mm512_srli_epi32(tmp, 7); + _mm512_storeu_si512(compressed + 6, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 8)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 24)); + w0 = _mm512_srli_epi32(tmp, 8); + _mm512_storeu_si512(compressed + 7, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 9)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 23)); + w1 = _mm512_srli_epi32(tmp, 9); + _mm512_storeu_si512(compressed + 8, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 10)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 22)); + w0 = _mm512_srli_epi32(tmp, 10); + _mm512_storeu_si512(compressed + 9, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 11)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 21)); + w1 = _mm512_srli_epi32(tmp, 11); + _mm512_storeu_si512(compressed + 10, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 12)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 20)); + w0 = _mm512_srli_epi32(tmp, 12); + _mm512_storeu_si512(compressed + 11, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 13)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 19)); + w1 = _mm512_srli_epi32(tmp, 13); + _mm512_storeu_si512(compressed + 12, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 14)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 18)); + w0 = _mm512_srli_epi32(tmp, 14); + _mm512_storeu_si512(compressed + 13, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 15)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 17)); + w1 = _mm512_srli_epi32(tmp, 15); + _mm512_storeu_si512(compressed + 14, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 16)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 16)); + w0 = _mm512_srli_epi32(tmp, 16); + _mm512_storeu_si512(compressed + 15, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 17)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 15)); + w1 = _mm512_srli_epi32(tmp, 17); + _mm512_storeu_si512(compressed + 16, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 18)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 14)); + w0 = _mm512_srli_epi32(tmp, 18); + _mm512_storeu_si512(compressed + 17, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 19)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 13)); + w1 = _mm512_srli_epi32(tmp, 19); + _mm512_storeu_si512(compressed + 18, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 20)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 12)); + w0 = _mm512_srli_epi32(tmp, 20); + _mm512_storeu_si512(compressed + 19, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 21)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 11)); + w1 = _mm512_srli_epi32(tmp, 21); + _mm512_storeu_si512(compressed + 20, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 22)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 10)); + w0 = _mm512_srli_epi32(tmp, 22); + 
_mm512_storeu_si512(compressed + 21, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 23)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 9)); + w1 = _mm512_srli_epi32(tmp, 23); + _mm512_storeu_si512(compressed + 22, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 24)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 8)); + w0 = _mm512_srli_epi32(tmp, 24); + _mm512_storeu_si512(compressed + 23, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 25)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 7)); + w1 = _mm512_srli_epi32(tmp, 25); + _mm512_storeu_si512(compressed + 24, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 26)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 6)); + w0 = _mm512_srli_epi32(tmp, 26); + _mm512_storeu_si512(compressed + 25, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 27)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 5)); + w1 = _mm512_srli_epi32(tmp, 27); + _mm512_storeu_si512(compressed + 26, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 28)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 4)); + w0 = _mm512_srli_epi32(tmp, 28); + _mm512_storeu_si512(compressed + 27, w1); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 29)); + w0 = _mm512_or_si512(w0, _mm512_slli_epi32(tmp, 3)); + w1 = _mm512_srli_epi32(tmp, 29); + _mm512_storeu_si512(compressed + 28, w0); + tmp = _mm512_and_si512(mask, _mm512_loadu_si512(in + 30)); + w1 = _mm512_or_si512(w1, _mm512_slli_epi32(tmp, 2)); + w0 = _mm512_srli_epi32(tmp, 30); + _mm512_storeu_si512(compressed + 29, w1); + w0 = _mm512_or_si512( + w0, _mm512_slli_epi32(_mm512_and_si512(mask, _mm512_loadu_si512(in + 31)), + 1)); + _mm512_storeu_si512(compressed + 30, w0); +} + +/* we are going to pack 512 32-bit values, touching 32 512-bit words, using 1024 + * bytes */ +static void avx512packblockmask32(const uint32_t *pin, __m512i *compressed) { + /* we are going to touch 32 512-bit words */ + __m512i w0, w1; + const __m512i *in = (const __m512i *)pin; + w0 = _mm512_loadu_si512(in + 0); + _mm512_storeu_si512(compressed + 0, w0); + w1 = _mm512_loadu_si512(in + 1); + _mm512_storeu_si512(compressed + 1, w1); + w0 = _mm512_loadu_si512(in + 2); + _mm512_storeu_si512(compressed + 2, w0); + w1 = _mm512_loadu_si512(in + 3); + _mm512_storeu_si512(compressed + 3, w1); + w0 = _mm512_loadu_si512(in + 4); + _mm512_storeu_si512(compressed + 4, w0); + w1 = _mm512_loadu_si512(in + 5); + _mm512_storeu_si512(compressed + 5, w1); + w0 = _mm512_loadu_si512(in + 6); + _mm512_storeu_si512(compressed + 6, w0); + w1 = _mm512_loadu_si512(in + 7); + _mm512_storeu_si512(compressed + 7, w1); + w0 = _mm512_loadu_si512(in + 8); + _mm512_storeu_si512(compressed + 8, w0); + w1 = _mm512_loadu_si512(in + 9); + _mm512_storeu_si512(compressed + 9, w1); + w0 = _mm512_loadu_si512(in + 10); + _mm512_storeu_si512(compressed + 10, w0); + w1 = _mm512_loadu_si512(in + 11); + _mm512_storeu_si512(compressed + 11, w1); + w0 = _mm512_loadu_si512(in + 12); + _mm512_storeu_si512(compressed + 12, w0); + w1 = _mm512_loadu_si512(in + 13); + _mm512_storeu_si512(compressed + 13, w1); + w0 = _mm512_loadu_si512(in + 14); + _mm512_storeu_si512(compressed + 14, w0); + w1 = _mm512_loadu_si512(in + 15); + _mm512_storeu_si512(compressed + 15, w1); + w0 = _mm512_loadu_si512(in + 16); + _mm512_storeu_si512(compressed + 16, w0); + w1 = _mm512_loadu_si512(in + 17); + _mm512_storeu_si512(compressed + 17, w1); + w0 = _mm512_loadu_si512(in + 18); + _mm512_storeu_si512(compressed + 18, w0); + w1 = 
_mm512_loadu_si512(in + 19); + _mm512_storeu_si512(compressed + 19, w1); + w0 = _mm512_loadu_si512(in + 20); + _mm512_storeu_si512(compressed + 20, w0); + w1 = _mm512_loadu_si512(in + 21); + _mm512_storeu_si512(compressed + 21, w1); + w0 = _mm512_loadu_si512(in + 22); + _mm512_storeu_si512(compressed + 22, w0); + w1 = _mm512_loadu_si512(in + 23); + _mm512_storeu_si512(compressed + 23, w1); + w0 = _mm512_loadu_si512(in + 24); + _mm512_storeu_si512(compressed + 24, w0); + w1 = _mm512_loadu_si512(in + 25); + _mm512_storeu_si512(compressed + 25, w1); + w0 = _mm512_loadu_si512(in + 26); + _mm512_storeu_si512(compressed + 26, w0); + w1 = _mm512_loadu_si512(in + 27); + _mm512_storeu_si512(compressed + 27, w1); + w0 = _mm512_loadu_si512(in + 28); + _mm512_storeu_si512(compressed + 28, w0); + w1 = _mm512_loadu_si512(in + 29); + _mm512_storeu_si512(compressed + 29, w1); + w0 = _mm512_loadu_si512(in + 30); + _mm512_storeu_si512(compressed + 30, w0); + w1 = _mm512_loadu_si512(in + 31); + _mm512_storeu_si512(compressed + 31, w1); +} + +static void avx512unpackblock0(const __m512i *compressed, uint32_t *pout) { + (void)compressed; + memset(pout, 0, 512); +} + +/* we packed 512 1-bit values, touching 1 512-bit words, using 32 bytes */ +static void avx512unpackblock1(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 1 512-bit word */ + __m512i w0; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(1); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 1))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 3))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 5))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 7))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 9))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 10))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 11))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 13))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 14))); + _mm512_storeu_si512(out + 15, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 15))); + _mm512_storeu_si512(out + 16, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 17))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 18))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 19))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 20))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 21))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 22))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 23))); + _mm512_storeu_si512(out + 24, + 
_mm512_and_si512(mask, _mm512_srli_epi32(w0, 24))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 25))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 26))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 27))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 28))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 29))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 30))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 31)); +} + +/* we packed 512 2-bit values, touching 2 512-bit words, using 64 bytes */ +static void avx512unpackblock2(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 2 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(3); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 10))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 14))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 18))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 20))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 22))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 24))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 26))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 28))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w0, 30)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 14))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 18))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 20))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 22))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 24))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 26))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, 
_mm512_srli_epi32(w1, 28))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 30)); +} + +/* we packed 512 3-bit values, touching 3 512-bit words, using 96 bytes */ +static void avx512unpackblock3(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 3 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(7); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 3))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 9))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 15))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 18))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 21))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 24))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 27))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 1))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 7))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + _mm512_storeu_si512(out + 15, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 13))); + _mm512_storeu_si512(out + 16, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 19))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 22))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 25))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 28))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 31), + _mm512_slli_epi32(w0, 1)))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 5))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 11))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 14))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 17))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 20))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 23))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 26))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 29)); +} + +/* we packed 512 4-bit values, touching 4 512-bit words, using 128 bytes */ +static void avx512unpackblock4(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 4 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + 
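/* descriptive note (added): 15 = (1 << 4) - 1, so the mask keeps the low 4 bits of each 32-bit lane */ +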
const __m512i mask = _mm512_set1_epi32(15); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 20))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 24))); + _mm512_storeu_si512(out + 7, _mm512_srli_epi32(w0, 28)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512(out + 8, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 20))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 24))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w1, 28)); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 20))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 24))); + _mm512_storeu_si512(out + 23, _mm512_srli_epi32(w0, 28)); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512(out + 24, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 20))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 24))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 28)); +} + +/* we packed 512 5-bit values, touching 5 512-bit words, using 160 bytes */ +static void avx512unpackblock5(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 5 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(31); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 5))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 10))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 15))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 20))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, 
_mm512_srli_epi32(w0, 25))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 3))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 13))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 18))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 23))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 1))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + _mm512_storeu_si512(out + 15, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 11))); + _mm512_storeu_si512(out + 16, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 21))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 26))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 31), + _mm512_slli_epi32(w1, 1)))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 9))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 14))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 19))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 24))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 29), + _mm512_slli_epi32(w0, 3)))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 7))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 17))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 22))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 27)); +} + +/* we packed 512 6-bit values, touching 6 512-bit words, using 192 bytes */ +static void avx512unpackblock6(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 6 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(63); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 18))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 24))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + _mm512_storeu_si512(out + 6, + 
_mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 22))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 14))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 20))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w0, 26)); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 18))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 24))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 10))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 22))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 14))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 20))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 26)); +} + +/* we packed 512 7-bit values, touching 7 512-bit words, using 224 bytes */ +static void avx512unpackblock7(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 7 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(127); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 7))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 14))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 21))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 3))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 17))); + _mm512_storeu_si512(out + 8, + 
_mm512_and_si512(mask, _mm512_srli_epi32(w1, 24))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 31), + _mm512_slli_epi32(w0, 1)))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 13))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 20))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 27), + _mm512_slli_epi32(w1, 5)))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + _mm512_storeu_si512(out + 15, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 9))); + _mm512_storeu_si512(out + 16, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 23))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 5))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 19))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + _mm512_slli_epi32(w1, 6)))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 1))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 15))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 22))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 29), + _mm512_slli_epi32(w0, 3)))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 11))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 18))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 25)); +} + +/* we packed 512 8-bit values, touching 8 512-bit words, using 256 bytes */ +static void avx512unpackblock8(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 8 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(255); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + _mm512_storeu_si512(out + 3, _mm512_srli_epi32(w0, 24)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512(out + 4, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + _mm512_storeu_si512(out + 7, _mm512_srli_epi32(w1, 24)); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512(out + 8, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 9, + 
_mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + _mm512_storeu_si512(out + 11, _mm512_srli_epi32(w0, 24)); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512(out + 12, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w1, 24)); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + _mm512_storeu_si512(out + 19, _mm512_srli_epi32(w0, 24)); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512(out + 20, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + _mm512_storeu_si512(out + 23, _mm512_srli_epi32(w1, 24)); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512(out + 24, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + _mm512_storeu_si512(out + 27, _mm512_srli_epi32(w0, 24)); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512(out + 28, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 24)); +} + +/* we packed 512 9-bit values, touching 9 512-bit words, using 288 bytes */ +static void avx512unpackblock9(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 9 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(511); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 9))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 18))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 27), + _mm512_slli_epi32(w1, 5)))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 13))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 22))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 31), + _mm512_slli_epi32(w0, 1)))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 17))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + _mm512_slli_epi32(w1, 6)))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 3))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, 
_mm512_srli_epi32(w1, 12))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 21))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + _mm512_storeu_si512(out + 15, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 7))); + _mm512_storeu_si512(out + 16, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 25), + _mm512_slli_epi32(w1, 7)))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 11))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 20))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 29), + _mm512_slli_epi32(w0, 3)))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 15))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 1))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 19))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 5))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 14))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 23)); +} + +/* we packed 512 10-bit values, touching 10 512-bit words, using 320 bytes */ +static void avx512unpackblock10(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 10 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(1023); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 10))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 20))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 18))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + _mm512_slli_epi32(w1, 6)))); + 
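/* descriptive note (added): out 9 took 6 bits from the top of w0 and 4 bits from the bottom of w1; out 10 lies fully inside w1 */ +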
_mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 14))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w0, 22)); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 20))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 18))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 14))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 22)); +} + +/* we packed 512 11-bit values, touching 11 512-bit words, using 352 bytes */ +static void avx512unpackblock11(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 11 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(2047); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 11))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 1))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 23), + _mm512_slli_epi32(w0, 9)))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 13))); + w1 = _mm512_loadu_si512(compressed + 3); + 
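/* descriptive note (added): out 8 straddles words 2 and 3: 8 bits come from the top of w0 and the remaining 3 bits from the freshly loaded w1 */ +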
_mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 3))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 14))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 25), + _mm512_slli_epi32(w0, 7)))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 15))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + _mm512_slli_epi32(w1, 6)))); + _mm512_storeu_si512(out + 15, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 5))); + _mm512_storeu_si512(out + 16, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 27), + _mm512_slli_epi32(w0, 5)))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 17))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 7))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 18))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 29), + _mm512_slli_epi32(w0, 3)))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 19))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 9))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 20))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 31), + _mm512_slli_epi32(w0, 1)))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 10))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 21)); +} + +/* we packed 512 12-bit values, touching 12 512-bit words, using 384 bytes */ +static void avx512unpackblock12(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 12 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(4095); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + 
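/* descriptive note (added): only the top 4 bits of word 1 remain; out 5 combines them with 8 bits from word 2, loaded next */ +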
w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 7, _mm512_srli_epi32(w0, 20)); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512(out + 8, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w1, 20)); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + _mm512_storeu_si512(out + 23, _mm512_srli_epi32(w0, 20)); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512(out + 24, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 20)); +} + +/* we packed 512 13-bit values, touching 13 512-bit words, using 416 bytes */ +static void avx512unpackblock13(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 13 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(8191); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 13))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + 
_mm512_slli_epi32(w1, 6)))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 7))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 1))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 14))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 27), + _mm512_slli_epi32(w1, 5)))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 21), + _mm512_slli_epi32(w0, 11)))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 15))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 9))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 22), + _mm512_slli_epi32(w0, 10)))); + _mm512_storeu_si512(out + 15, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 3))); + _mm512_storeu_si512(out + 16, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 29), + _mm512_slli_epi32(w1, 3)))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 23), + _mm512_slli_epi32(w0, 9)))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 17))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 11))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 5))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 18))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 31), + _mm512_slli_epi32(w1, 1)))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 25), + _mm512_slli_epi32(w0, 7)))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 19)); +} + +/* we packed 512 14-bit values, touching 14 512-bit words, 
using 448 bytes */ +static void avx512unpackblock14(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 14 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(16383); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 14))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + _mm512_slli_epi32(w1, 6)))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 22), + _mm512_slli_epi32(w0, 10)))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w0, 18)); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 14))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 10))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 16))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 27, + 
_mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 18)); +} + +/* we packed 512 15-bit values, touching 15 512-bit words, using 480 bytes */ +static void avx512unpackblock15(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 15 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(32767); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 15))); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 13))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 11))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + _mm512_slli_epi32(w1, 6)))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 9))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 7))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 5))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 3))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 18), + _mm512_slli_epi32(w1, 14)))); + _mm512_storeu_si512(out + 15, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 1))); + _mm512_storeu_si512(out + 16, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 16))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 31), + _mm512_slli_epi32(w0, 1)))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 14))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 29), + _mm512_slli_epi32(w1, 3)))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, 
_mm512_or_si512(_mm512_srli_epi32(w1, 27), + _mm512_slli_epi32(w0, 5)))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 10))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 25), + _mm512_slli_epi32(w1, 7)))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 23), + _mm512_slli_epi32(w0, 9)))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 21), + _mm512_slli_epi32(w1, 11)))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 19), + _mm512_slli_epi32(w0, 13)))); + _mm512_storeu_si512(out + 30, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 17)); +} + +/* we packed 512 16-bit values, touching 16 512-bit words, using 512 bytes */ +static void avx512unpackblock16(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 16 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(65535); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 1, _mm512_srli_epi32(w0, 16)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512(out + 2, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 3, _mm512_srli_epi32(w1, 16)); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512(out + 4, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 5, _mm512_srli_epi32(w0, 16)); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512(out + 6, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 7, _mm512_srli_epi32(w1, 16)); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512(out + 8, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 9, _mm512_srli_epi32(w0, 16)); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512(out + 10, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 11, _mm512_srli_epi32(w1, 16)); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512(out + 12, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 13, _mm512_srli_epi32(w0, 16)); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512(out + 14, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w1, 16)); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 17, _mm512_srli_epi32(w0, 16)); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512(out + 18, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 19, _mm512_srli_epi32(w1, 16)); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512(out + 20, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 21, _mm512_srli_epi32(w0, 16)); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512(out + 22, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 23, _mm512_srli_epi32(w1, 16)); + w0 = 
_mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512(out + 24, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 25, _mm512_srli_epi32(w0, 16)); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512(out + 26, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 27, _mm512_srli_epi32(w1, 16)); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512(out + 28, _mm512_and_si512(mask, w0)); + _mm512_storeu_si512(out + 29, _mm512_srli_epi32(w0, 16)); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512(out + 30, _mm512_and_si512(mask, w1)); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 16)); +} + +/* we packed 512 17-bit values, touching 17 512-bit words, using 544 bytes */ +static void avx512unpackblock17(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 17 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(131071); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 17), + _mm512_slli_epi32(w1, 15)))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 19), + _mm512_slli_epi32(w0, 13)))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 21), + _mm512_slli_epi32(w1, 11)))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 23), + _mm512_slli_epi32(w0, 9)))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 25), + _mm512_slli_epi32(w1, 7)))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 27), + _mm512_slli_epi32(w0, 5)))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 29), + _mm512_slli_epi32(w1, 3)))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 14))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 15, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 31), + _mm512_slli_epi32(w0, 1)))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 16, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 1))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 18), + _mm512_slli_epi32(w0, 14)))); + _mm512_storeu_si512(out + 19, + 
_mm512_and_si512(mask, _mm512_srli_epi32(w0, 3))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 5))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 22), + _mm512_slli_epi32(w0, 10)))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 7))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 9))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 11))); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 13))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 15)); +} + +/* we packed 512 18-bit values, touching 18 512-bit words, using 576 bytes */ +static void avx512unpackblock18(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 18 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(262143); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 18), + _mm512_slli_epi32(w1, 14)))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 22), + _mm512_slli_epi32(w0, 10)))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + _mm512_slli_epi32(w1, 6)))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 12))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + w1 = _mm512_loadu_si512(compressed + 7); + 
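+  /* an 18-bit value that straddles two 32-bit lanes is rebuilt by OR-ing the
+     high bits of the current word (shifted right) with the low bits of the
+     next word (shifted left), then masking the result back to 18 bits */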
_mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w0, 14)); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 18), + _mm512_slli_epi32(w0, 14)))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 10))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 14)); +} + +/* we packed 512 19-bit values, touching 19 512-bit words, using 608 bytes */ +static void avx512unpackblock19(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 19 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(524287); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 19), + _mm512_slli_epi32(w1, 13)))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 25), + _mm512_slli_epi32(w0, 7)))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 12))); + w1 = _mm512_loadu_si512(compressed + 3); + 
_mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 31), + _mm512_slli_epi32(w1, 1)))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 18), + _mm512_slli_epi32(w0, 14)))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 5))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 11))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 17), + _mm512_slli_epi32(w1, 15)))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 23), + _mm512_slli_epi32(w0, 9)))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 10))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 15, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 29), + _mm512_slli_epi32(w1, 3)))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 16, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 3))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 9))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 15), + _mm512_slli_epi32(w1, 17)))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 21), + _mm512_slli_epi32(w0, 11)))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 27), + _mm512_slli_epi32(w1, 5)))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 14), + _mm512_slli_epi32(w0, 18)))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 1))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 7))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 30, + 
_mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 13)); +} + +/* we packed 512 20-bit values, touching 20 512-bit words, using 640 bytes */ +static void avx512unpackblock20(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 20 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(1048575); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + _mm512_storeu_si512(out + 7, _mm512_srli_epi32(w0, 12)); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512(out + 8, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w1, 12)); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + 
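+  /* the next value occupies the top 20 bits of w0, so a plain right shift by
+     12 suffices and no mask is required */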
_mm512_storeu_si512(out + 23, _mm512_srli_epi32(w0, 12)); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512(out + 24, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 12)); +} + +/* we packed 512 21-bit values, touching 21 512-bit words, using 672 bytes */ +static void avx512unpackblock21(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 21 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(2097151); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 21), + _mm512_slli_epi32(w1, 11)))); + _mm512_storeu_si512(out + 2, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 10))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 31), + _mm512_slli_epi32(w0, 1)))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 9))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 19), + _mm512_slli_epi32(w1, 13)))); + _mm512_storeu_si512(out + 8, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 29), + _mm512_slli_epi32(w0, 3)))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 18), + _mm512_slli_epi32(w1, 14)))); + _mm512_storeu_si512(out + 11, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 7))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 17), + _mm512_slli_epi32(w1, 15)))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + w0 = 
_mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 15, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 27), + _mm512_slli_epi32(w0, 5)))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 16, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 5))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 15), + _mm512_slli_epi32(w1, 17)))); + _mm512_storeu_si512(out + 20, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 25), + _mm512_slli_epi32(w0, 7)))); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 14), + _mm512_slli_epi32(w1, 18)))); + _mm512_storeu_si512(out + 23, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 3))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 13), + _mm512_slli_epi32(w1, 19)))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 23), + _mm512_slli_epi32(w0, 9)))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 12), + _mm512_slli_epi32(w1, 20)))); + _mm512_storeu_si512(out + 29, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 1))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 22), + _mm512_slli_epi32(w0, 10)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 11)); +} + +/* we packed 512 22-bit values, touching 22 512-bit words, using 704 bytes */ +static void avx512unpackblock22(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 22 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(4194303); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 12), + _mm512_slli_epi32(w0, 20)))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 5, + 
_mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 14), + _mm512_slli_epi32(w0, 18)))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + _mm512_slli_epi32(w1, 6)))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 6))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 18), + _mm512_slli_epi32(w0, 14)))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 8))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w0, 10)); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 22), + _mm512_slli_epi32(w0, 10)))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 12), + _mm512_slli_epi32(w1, 20)))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 14), + _mm512_slli_epi32(w1, 18)))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 18), + _mm512_slli_epi32(w1, 14)))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + w1 = 
_mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 10)); +} + +/* we packed 512 23-bit values, touching 23 512-bit words, using 736 bytes */ +static void avx512unpackblock23(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 23 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(8388607); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 23), + _mm512_slli_epi32(w1, 9)))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 14), + _mm512_slli_epi32(w0, 18)))); + _mm512_storeu_si512(out + 3, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 5))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 19), + _mm512_slli_epi32(w0, 13)))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 10), + _mm512_slli_epi32(w1, 22)))); + _mm512_storeu_si512(out + 7, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 1))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 15), + _mm512_slli_epi32(w1, 17)))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 29), + _mm512_slli_epi32(w0, 3)))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 11), + _mm512_slli_epi32(w0, 21)))); + _mm512_storeu_si512(out + 14, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 15, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 25), + _mm512_slli_epi32(w1, 7)))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 16, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + _mm512_storeu_si512(out + 17, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 7))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 21), + _mm512_slli_epi32(w0, 11)))); + 
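+  /* mask = 8388607 = 2^23 - 1 keeps the low 23 bits of each 32-bit lane; a
+     block of 512 23-bit values spans 23 consecutive 512-bit words
+     (23 * 64 = 1472 bytes of packed input) */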
w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 12), + _mm512_slli_epi32(w1, 20)))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 3))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 17), + _mm512_slli_epi32(w1, 15)))); + _mm512_storeu_si512(out + 24, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 8))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 31), + _mm512_slli_epi32(w0, 1)))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 13), + _mm512_slli_epi32(w0, 19)))); + _mm512_storeu_si512(out + 28, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + w1 = _mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 27), + _mm512_slli_epi32(w1, 5)))); + w0 = _mm512_loadu_si512(compressed + 22); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 18), + _mm512_slli_epi32(w0, 14)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 9)); +} + +/* we packed 512 24-bit values, touching 24 512-bit words, using 768 bytes */ +static void avx512unpackblock24(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 24 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(16777215); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + _mm512_storeu_si512(out + 3, _mm512_srli_epi32(w0, 8)); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512(out + 4, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + _mm512_storeu_si512(out + 7, _mm512_srli_epi32(w1, 8)); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512(out + 8, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + 
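+  /* with 24-bit values the layout repeats every three 32-bit words (four
+     values), so every fourth output sits in the top 24 bits of its word and
+     needs only a shift, no mask */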
_mm512_storeu_si512(out + 11, _mm512_srli_epi32(w0, 8)); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512(out + 12, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w1, 8)); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + _mm512_storeu_si512(out + 19, _mm512_srli_epi32(w0, 8)); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512(out + 20, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + _mm512_storeu_si512(out + 23, _mm512_srli_epi32(w1, 8)); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512(out + 24, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + _mm512_storeu_si512(out + 27, _mm512_srli_epi32(w0, 8)); + w1 = _mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512(out + 28, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 22); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 23); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 8)); +} + +/* we packed 512 25-bit values, touching 25 512-bit words, using 800 bytes */ +static void avx512unpackblock25(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 25 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(33554431); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 25), + _mm512_slli_epi32(w1, 7)))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 18), + _mm512_slli_epi32(w0, 14)))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, 
_mm512_or_si512(_mm512_srli_epi32(w0, 11), + _mm512_slli_epi32(w1, 21)))); + _mm512_storeu_si512(out + 4, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 29), + _mm512_slli_epi32(w0, 3)))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 15), + _mm512_slli_epi32(w0, 17)))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 8), + _mm512_slli_epi32(w1, 24)))); + _mm512_storeu_si512(out + 9, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 1))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 19), + _mm512_slli_epi32(w1, 13)))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 12), + _mm512_slli_epi32(w0, 20)))); + _mm512_storeu_si512(out + 13, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 5))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 15, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 23), + _mm512_slli_epi32(w0, 9)))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 16, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 9), + _mm512_slli_epi32(w0, 23)))); + _mm512_storeu_si512(out + 18, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 27), + _mm512_slli_epi32(w1, 5)))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 13), + _mm512_slli_epi32(w1, 19)))); + _mm512_storeu_si512(out + 22, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 6))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 31), + _mm512_slli_epi32(w0, 1)))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 17), + _mm512_slli_epi32(w0, 15)))); + w1 = 
_mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 10), + _mm512_slli_epi32(w1, 22)))); + _mm512_storeu_si512(out + 27, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 3))); + w0 = _mm512_loadu_si512(compressed + 22); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 23); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 21), + _mm512_slli_epi32(w1, 11)))); + w0 = _mm512_loadu_si512(compressed + 24); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 14), + _mm512_slli_epi32(w0, 18)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 7)); +} + +/* we packed 512 26-bit values, touching 26 512-bit words, using 832 bytes */ +static void avx512unpackblock26(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 26 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(67108863); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + _mm512_slli_epi32(w1, 6)))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 14), + _mm512_slli_epi32(w1, 18)))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 8), + _mm512_slli_epi32(w0, 24)))); + _mm512_storeu_si512(out + 5, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 2))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 22), + _mm512_slli_epi32(w0, 10)))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 10), + _mm512_slli_epi32(w0, 22)))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 18), + _mm512_slli_epi32(w1, 14)))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 12), + _mm512_slli_epi32(w0, 
20)))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w0, 6)); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 14), + _mm512_slli_epi32(w0, 18)))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 8), + _mm512_slli_epi32(w1, 24)))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + w1 = _mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 10), + _mm512_slli_epi32(w1, 22)))); + _mm512_storeu_si512(out + 26, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 4))); + w0 = _mm512_loadu_si512(compressed + 22); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + w1 = _mm512_loadu_si512(compressed + 23); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 24); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 18), + _mm512_slli_epi32(w0, 14)))); + w1 = _mm512_loadu_si512(compressed + 25); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 12), + _mm512_slli_epi32(w1, 20)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 6)); +} + +/* we packed 512 27-bit values, touching 27 512-bit words, using 864 bytes */ +static void avx512unpackblock27(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 27 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(134217727); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 27), + _mm512_slli_epi32(w1, 5)))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 22), + _mm512_slli_epi32(w0, 10)))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 17), + _mm512_slli_epi32(w1, 15)))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( 
+ out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 12), + _mm512_slli_epi32(w0, 20)))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 7), + _mm512_slli_epi32(w1, 25)))); + _mm512_storeu_si512(out + 6, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 29), + _mm512_slli_epi32(w0, 3)))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 19), + _mm512_slli_epi32(w0, 13)))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 14), + _mm512_slli_epi32(w1, 18)))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 9), + _mm512_slli_epi32(w0, 23)))); + _mm512_storeu_si512(out + 12, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 4))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 31), + _mm512_slli_epi32(w1, 1)))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 15, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 21), + _mm512_slli_epi32(w1, 11)))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 16, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 11), + _mm512_slli_epi32(w1, 21)))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 6), + _mm512_slli_epi32(w0, 26)))); + _mm512_storeu_si512(out + 19, + _mm512_and_si512(mask, _mm512_srli_epi32(w0, 1))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 23), + _mm512_slli_epi32(w0, 9)))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 18), + _mm512_slli_epi32(w1, 14)))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 13), + _mm512_slli_epi32(w0, 19)))); + w1 = _mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 8), + _mm512_slli_epi32(w1, 24)))); + _mm512_storeu_si512(out + 25, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 3))); + w0 = _mm512_loadu_si512(compressed + 22); + _mm512_storeu_si512( + out + 26, + 
_mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + w1 = _mm512_loadu_si512(compressed + 23); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 25), + _mm512_slli_epi32(w1, 7)))); + w0 = _mm512_loadu_si512(compressed + 24); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + w1 = _mm512_loadu_si512(compressed + 25); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 15), + _mm512_slli_epi32(w1, 17)))); + w0 = _mm512_loadu_si512(compressed + 26); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 10), + _mm512_slli_epi32(w0, 22)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 5)); +} + +/* we packed 512 28-bit values, touching 28 512-bit words, using 896 bytes */ +static void avx512unpackblock28(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 28 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(268435455); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 12), + _mm512_slli_epi32(w1, 20)))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 8), + _mm512_slli_epi32(w0, 24)))); + _mm512_storeu_si512(out + 7, _mm512_srli_epi32(w0, 4)); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512(out + 8, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 12), + _mm512_slli_epi32(w0, 20)))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 8), + 
_mm512_slli_epi32(w1, 24)))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w1, 4)); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 12), + _mm512_slli_epi32(w1, 20)))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 8), + _mm512_slli_epi32(w0, 24)))); + _mm512_storeu_si512(out + 23, _mm512_srli_epi32(w0, 4)); + w1 = _mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512(out + 24, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 22); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 23); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 24); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + w1 = _mm512_loadu_si512(compressed + 25); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + w0 = _mm512_loadu_si512(compressed + 26); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 12), + _mm512_slli_epi32(w0, 20)))); + w1 = _mm512_loadu_si512(compressed + 27); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 8), + _mm512_slli_epi32(w1, 24)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 4)); +} + +/* we packed 512 29-bit values, touching 29 512-bit words, using 928 bytes */ +static void avx512unpackblock29(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 29 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(536870911); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 29), + _mm512_slli_epi32(w1, 3)))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 23), + _mm512_slli_epi32(w1, 9)))); + w0 = _mm512_loadu_si512(compressed + 4); + 
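+  /* unaligned loads and stores (_mm512_loadu_si512 / _mm512_storeu_si512) are
+     used throughout, so neither the packed input nor the output buffer needs
+     to be 64-byte aligned */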
_mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 17), + _mm512_slli_epi32(w1, 15)))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 14), + _mm512_slli_epi32(w0, 18)))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 11), + _mm512_slli_epi32(w1, 21)))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 8), + _mm512_slli_epi32(w0, 24)))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 5), + _mm512_slli_epi32(w1, 27)))); + _mm512_storeu_si512(out + 10, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 2))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 31), + _mm512_slli_epi32(w0, 1)))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 25), + _mm512_slli_epi32(w0, 7)))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 15, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 19), + _mm512_slli_epi32(w0, 13)))); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 16, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 13), + _mm512_slli_epi32(w0, 19)))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 10), + _mm512_slli_epi32(w1, 22)))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 7), + _mm512_slli_epi32(w0, 25)))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 4), + _mm512_slli_epi32(w1, 28)))); + _mm512_storeu_si512(out + 21, + _mm512_and_si512(mask, _mm512_srli_epi32(w1, 1))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + w1 = _mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 27), + _mm512_slli_epi32(w1, 5)))); + w0 = _mm512_loadu_si512(compressed + 22); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 23); + 
_mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 21), + _mm512_slli_epi32(w1, 11)))); + w0 = _mm512_loadu_si512(compressed + 24); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 18), + _mm512_slli_epi32(w0, 14)))); + w1 = _mm512_loadu_si512(compressed + 25); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 15), + _mm512_slli_epi32(w1, 17)))); + w0 = _mm512_loadu_si512(compressed + 26); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 12), + _mm512_slli_epi32(w0, 20)))); + w1 = _mm512_loadu_si512(compressed + 27); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 9), + _mm512_slli_epi32(w1, 23)))); + w0 = _mm512_loadu_si512(compressed + 28); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 6), + _mm512_slli_epi32(w0, 26)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 3)); +} + +/* we packed 512 30-bit values, touching 30 512-bit words, using 960 bytes */ +static void avx512unpackblock30(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 30 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(1073741823); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, _mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 30), + _mm512_slli_epi32(w1, 2)))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 26), + _mm512_slli_epi32(w1, 6)))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 22), + _mm512_slli_epi32(w1, 10)))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 18), + _mm512_slli_epi32(w1, 14)))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 14), + _mm512_slli_epi32(w1, 18)))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 12), + _mm512_slli_epi32(w0, 20)))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 10), + _mm512_slli_epi32(w1, 22)))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, 
_mm512_or_si512(_mm512_srli_epi32(w1, 8), + _mm512_slli_epi32(w0, 24)))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 6), + _mm512_slli_epi32(w1, 26)))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 4), + _mm512_slli_epi32(w0, 28)))); + _mm512_storeu_si512(out + 15, _mm512_srli_epi32(w0, 2)); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512(out + 16, _mm512_and_si512(mask, w1)); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 28), + _mm512_slli_epi32(w1, 4)))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 24), + _mm512_slli_epi32(w1, 8)))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 22), + _mm512_slli_epi32(w0, 10)))); + w1 = _mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 20), + _mm512_slli_epi32(w1, 12)))); + w0 = _mm512_loadu_si512(compressed + 22); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 18), + _mm512_slli_epi32(w0, 14)))); + w1 = _mm512_loadu_si512(compressed + 23); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 16), + _mm512_slli_epi32(w1, 16)))); + w0 = _mm512_loadu_si512(compressed + 24); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 14), + _mm512_slli_epi32(w0, 18)))); + w1 = _mm512_loadu_si512(compressed + 25); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 12), + _mm512_slli_epi32(w1, 20)))); + w0 = _mm512_loadu_si512(compressed + 26); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 10), + _mm512_slli_epi32(w0, 22)))); + w1 = _mm512_loadu_si512(compressed + 27); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 8), + _mm512_slli_epi32(w1, 24)))); + w0 = _mm512_loadu_si512(compressed + 28); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 6), + _mm512_slli_epi32(w0, 26)))); + w1 = _mm512_loadu_si512(compressed + 29); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 4), + _mm512_slli_epi32(w1, 28)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w1, 2)); +} + +/* we packed 512 31-bit values, touching 31 512-bit words, using 992 bytes */ +static void avx512unpackblock31(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 31 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + const __m512i mask = _mm512_set1_epi32(2147483647); + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, 
_mm512_and_si512(mask, w0)); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512( + out + 1, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 31), + _mm512_slli_epi32(w1, 1)))); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512( + out + 2, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 30), + _mm512_slli_epi32(w0, 2)))); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512( + out + 3, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 29), + _mm512_slli_epi32(w1, 3)))); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512( + out + 4, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 28), + _mm512_slli_epi32(w0, 4)))); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512( + out + 5, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 27), + _mm512_slli_epi32(w1, 5)))); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512( + out + 6, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 26), + _mm512_slli_epi32(w0, 6)))); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512( + out + 7, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 25), + _mm512_slli_epi32(w1, 7)))); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512( + out + 8, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 24), + _mm512_slli_epi32(w0, 8)))); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512( + out + 9, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 23), + _mm512_slli_epi32(w1, 9)))); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512( + out + 10, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 22), + _mm512_slli_epi32(w0, 10)))); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512( + out + 11, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 21), + _mm512_slli_epi32(w1, 11)))); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512( + out + 12, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 20), + _mm512_slli_epi32(w0, 12)))); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512( + out + 13, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 19), + _mm512_slli_epi32(w1, 13)))); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512( + out + 14, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 18), + _mm512_slli_epi32(w0, 14)))); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512( + out + 15, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 17), + _mm512_slli_epi32(w1, 15)))); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512( + out + 16, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 16), + _mm512_slli_epi32(w0, 16)))); + w1 = _mm512_loadu_si512(compressed + 17); + _mm512_storeu_si512( + out + 17, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 15), + _mm512_slli_epi32(w1, 17)))); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512( + out + 18, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 14), + _mm512_slli_epi32(w0, 18)))); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512( + out + 19, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 13), + _mm512_slli_epi32(w1, 19)))); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512( + out + 20, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 12), + 
_mm512_slli_epi32(w0, 20)))); + w1 = _mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512( + out + 21, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 11), + _mm512_slli_epi32(w1, 21)))); + w0 = _mm512_loadu_si512(compressed + 22); + _mm512_storeu_si512( + out + 22, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 10), + _mm512_slli_epi32(w0, 22)))); + w1 = _mm512_loadu_si512(compressed + 23); + _mm512_storeu_si512( + out + 23, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 9), + _mm512_slli_epi32(w1, 23)))); + w0 = _mm512_loadu_si512(compressed + 24); + _mm512_storeu_si512( + out + 24, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 8), + _mm512_slli_epi32(w0, 24)))); + w1 = _mm512_loadu_si512(compressed + 25); + _mm512_storeu_si512( + out + 25, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 7), + _mm512_slli_epi32(w1, 25)))); + w0 = _mm512_loadu_si512(compressed + 26); + _mm512_storeu_si512( + out + 26, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 6), + _mm512_slli_epi32(w0, 26)))); + w1 = _mm512_loadu_si512(compressed + 27); + _mm512_storeu_si512( + out + 27, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 5), + _mm512_slli_epi32(w1, 27)))); + w0 = _mm512_loadu_si512(compressed + 28); + _mm512_storeu_si512( + out + 28, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 4), + _mm512_slli_epi32(w0, 28)))); + w1 = _mm512_loadu_si512(compressed + 29); + _mm512_storeu_si512( + out + 29, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w0, 3), + _mm512_slli_epi32(w1, 29)))); + w0 = _mm512_loadu_si512(compressed + 30); + _mm512_storeu_si512( + out + 30, + _mm512_and_si512(mask, _mm512_or_si512(_mm512_srli_epi32(w1, 2), + _mm512_slli_epi32(w0, 30)))); + _mm512_storeu_si512(out + 31, _mm512_srli_epi32(w0, 1)); +} + +/* we packed 512 32-bit values, touching 32 512-bit words, using 1024 bytes */ +static void avx512unpackblock32(const __m512i *compressed, uint32_t *pout) { + /* we are going to access 32 512-bit words */ + __m512i w0, w1; + __m512i *out = (__m512i *)pout; + w0 = _mm512_loadu_si512(compressed); + _mm512_storeu_si512(out + 0, w0); + w1 = _mm512_loadu_si512(compressed + 1); + _mm512_storeu_si512(out + 1, w1); + w0 = _mm512_loadu_si512(compressed + 2); + _mm512_storeu_si512(out + 2, w0); + w1 = _mm512_loadu_si512(compressed + 3); + _mm512_storeu_si512(out + 3, w1); + w0 = _mm512_loadu_si512(compressed + 4); + _mm512_storeu_si512(out + 4, w0); + w1 = _mm512_loadu_si512(compressed + 5); + _mm512_storeu_si512(out + 5, w1); + w0 = _mm512_loadu_si512(compressed + 6); + _mm512_storeu_si512(out + 6, w0); + w1 = _mm512_loadu_si512(compressed + 7); + _mm512_storeu_si512(out + 7, w1); + w0 = _mm512_loadu_si512(compressed + 8); + _mm512_storeu_si512(out + 8, w0); + w1 = _mm512_loadu_si512(compressed + 9); + _mm512_storeu_si512(out + 9, w1); + w0 = _mm512_loadu_si512(compressed + 10); + _mm512_storeu_si512(out + 10, w0); + w1 = _mm512_loadu_si512(compressed + 11); + _mm512_storeu_si512(out + 11, w1); + w0 = _mm512_loadu_si512(compressed + 12); + _mm512_storeu_si512(out + 12, w0); + w1 = _mm512_loadu_si512(compressed + 13); + _mm512_storeu_si512(out + 13, w1); + w0 = _mm512_loadu_si512(compressed + 14); + _mm512_storeu_si512(out + 14, w0); + w1 = _mm512_loadu_si512(compressed + 15); + _mm512_storeu_si512(out + 15, w1); + w0 = _mm512_loadu_si512(compressed + 16); + _mm512_storeu_si512(out + 16, w0); + w1 = _mm512_loadu_si512(compressed + 17); + 
_mm512_storeu_si512(out + 17, w1); + w0 = _mm512_loadu_si512(compressed + 18); + _mm512_storeu_si512(out + 18, w0); + w1 = _mm512_loadu_si512(compressed + 19); + _mm512_storeu_si512(out + 19, w1); + w0 = _mm512_loadu_si512(compressed + 20); + _mm512_storeu_si512(out + 20, w0); + w1 = _mm512_loadu_si512(compressed + 21); + _mm512_storeu_si512(out + 21, w1); + w0 = _mm512_loadu_si512(compressed + 22); + _mm512_storeu_si512(out + 22, w0); + w1 = _mm512_loadu_si512(compressed + 23); + _mm512_storeu_si512(out + 23, w1); + w0 = _mm512_loadu_si512(compressed + 24); + _mm512_storeu_si512(out + 24, w0); + w1 = _mm512_loadu_si512(compressed + 25); + _mm512_storeu_si512(out + 25, w1); + w0 = _mm512_loadu_si512(compressed + 26); + _mm512_storeu_si512(out + 26, w0); + w1 = _mm512_loadu_si512(compressed + 27); + _mm512_storeu_si512(out + 27, w1); + w0 = _mm512_loadu_si512(compressed + 28); + _mm512_storeu_si512(out + 28, w0); + w1 = _mm512_loadu_si512(compressed + 29); + _mm512_storeu_si512(out + 29, w1); + w0 = _mm512_loadu_si512(compressed + 30); + _mm512_storeu_si512(out + 30, w0); + w1 = _mm512_loadu_si512(compressed + 31); + _mm512_storeu_si512(out + 31, w1); +} + +static avx512packblockfnc avx512funcPackArr[] = { + &avx512packblock0, &avx512packblock1, &avx512packblock2, + &avx512packblock3, &avx512packblock4, &avx512packblock5, + &avx512packblock6, &avx512packblock7, &avx512packblock8, + &avx512packblock9, &avx512packblock10, &avx512packblock11, + &avx512packblock12, &avx512packblock13, &avx512packblock14, + &avx512packblock15, &avx512packblock16, &avx512packblock17, + &avx512packblock18, &avx512packblock19, &avx512packblock20, + &avx512packblock21, &avx512packblock22, &avx512packblock23, + &avx512packblock24, &avx512packblock25, &avx512packblock26, + &avx512packblock27, &avx512packblock28, &avx512packblock29, + &avx512packblock30, &avx512packblock31, &avx512packblock32}; +static avx512packblockfnc avx512funcPackMaskArr[] = { + &avx512packblockmask0, &avx512packblockmask1, &avx512packblockmask2, + &avx512packblockmask3, &avx512packblockmask4, &avx512packblockmask5, + &avx512packblockmask6, &avx512packblockmask7, &avx512packblockmask8, + &avx512packblockmask9, &avx512packblockmask10, &avx512packblockmask11, + &avx512packblockmask12, &avx512packblockmask13, &avx512packblockmask14, + &avx512packblockmask15, &avx512packblockmask16, &avx512packblockmask17, + &avx512packblockmask18, &avx512packblockmask19, &avx512packblockmask20, + &avx512packblockmask21, &avx512packblockmask22, &avx512packblockmask23, + &avx512packblockmask24, &avx512packblockmask25, &avx512packblockmask26, + &avx512packblockmask27, &avx512packblockmask28, &avx512packblockmask29, + &avx512packblockmask30, &avx512packblockmask31, &avx512packblockmask32}; +static avx512unpackblockfnc avx512funcUnpackArr[] = { + &avx512unpackblock0, &avx512unpackblock1, &avx512unpackblock2, + &avx512unpackblock3, &avx512unpackblock4, &avx512unpackblock5, + &avx512unpackblock6, &avx512unpackblock7, &avx512unpackblock8, + &avx512unpackblock9, &avx512unpackblock10, &avx512unpackblock11, + &avx512unpackblock12, &avx512unpackblock13, &avx512unpackblock14, + &avx512unpackblock15, &avx512unpackblock16, &avx512unpackblock17, + &avx512unpackblock18, &avx512unpackblock19, &avx512unpackblock20, + &avx512unpackblock21, &avx512unpackblock22, &avx512unpackblock23, + &avx512unpackblock24, &avx512unpackblock25, &avx512unpackblock26, + &avx512unpackblock27, &avx512unpackblock28, &avx512unpackblock29, + &avx512unpackblock30, &avx512unpackblock31, &avx512unpackblock32}; 
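The three dispatch tables above map a bit width (0 through 32) to the matching kernel: avx512funcPackArr holds the plain packers, avx512funcPackMaskArr the masking packers, and avx512funcUnpackArr the unpackers; the avx512pack, avx512packwithoutmask and avx512unpack entry points defined next simply index into them. A minimal round-trip sketch, illustrative only and not part of the patch: it assumes avx512bitpacking.h declares these entry points, that the caller picks a bit width (7 here) large enough for every input value, and that the compiler is invoked with AVX-512 enabled.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <immintrin.h>
    #include "avx512bitpacking.h"

    int main(void) {
      uint32_t in[512], back[512];
      __m512i buffer[32];        /* enough for the worst case, bit = 32 */
      const uint32_t bit = 7;    /* every input value must fit in 7 bits */
      uint32_t i;
      for (i = 0; i < 512; i++)
        in[i] = i % 128;         /* 0..127, fits in 7 bits */
      avx512pack(in, buffer, bit);      /* masks inputs, writes `bit` 512-bit words */
      avx512unpack(buffer, back, bit);  /* reads `bit` words, restores 512 values */
      if (memcmp(in, back, sizeof(in)) != 0) {
        printf("round-trip failed\n");
        return 1;
      }
      printf("round-trip ok, compressed size = %u bytes\n", bit * 64);
      return 0;
    }

Each block always covers 512 values, so the compressed size is exactly bit * 64 bytes; avx512packwithoutmask skips the masking step and is only safe when the caller guarantees the inputs already fit in the chosen bit width.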
+/** avx512packing **/ + +/* reads 512 values from "in", writes "bit" 512-bit vectors to "out" */ +void avx512pack(const uint32_t *in, __m512i *out, const uint32_t bit) { + avx512funcPackMaskArr[bit](in, out); +} + +/* reads 512 values from "in", writes "bit" 512-bit vectors to "out" */ +void avx512packwithoutmask(const uint32_t *in, __m512i *out, + const uint32_t bit) { + avx512funcPackArr[bit](in, out); +} + +/* reads "bit" 512-bit vectors from "in", writes 512 values to "out" */ +void avx512unpack(const __m512i *in, uint32_t *out, const uint32_t bit) { + avx512funcUnpackArr[bit](in, out); +} + +#endif /* __AVX512F__ */ diff --git a/src/avxbitpacking.c b/src/avxbitpacking.c new file mode 100644 index 0000000..0f25b94 --- /dev/null +++ b/src/avxbitpacking.c @@ -0,0 +1,9920 @@ +#include "avxbitpacking.h" +#ifdef __AVX2__ + +static uint32_t maxbitas32int(const __m256i accumulator) { + const __m256i _tmp1 = + _mm256_or_si256(_mm256_srli_si256(accumulator, 8), accumulator); + const __m256i _tmp2 = _mm256_or_si256(_mm256_srli_si256(_tmp1, 4), _tmp1); + uint32_t ans1 = _mm256_extract_epi32(_tmp2, 0); + uint32_t ans2 = _mm256_extract_epi32(_tmp2, 4); + uint32_t ans = ans1 > ans2 ? ans1 : ans2; + return bits(ans); +} + +uint32_t avxmaxbits(const uint32_t *begin) { + const __m256i *pin = (const __m256i *)(begin); + __m256i accumulator = _mm256_lddqu_si256(pin); + uint32_t k = 1; + for (; 8 * k < AVXBlockSize; ++k) { + __m256i newvec = _mm256_lddqu_si256(pin + k); + accumulator = _mm256_or_si256(accumulator, newvec); + } + return maxbitas32int(accumulator); +} + +/** avxpacking **/ + +typedef void (*avxpackblockfnc)(const uint32_t *pin, __m256i *compressed); +typedef void (*avxunpackblockfnc)(const __m256i *compressed, uint32_t *pout); + +static void avxpackblock0(const uint32_t *pin, __m256i *compressed) { + (void)compressed; + (void)pin; /* we consumed 256 32-bit integers */ +} + +/* we are going to pack 256 1-bit values, touching 1 256-bit words, using 16 + * bytes */ +static void avxpackblock1(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 1 256-bit word */ + __m256i w0; + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 7)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 9)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 13)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 15)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 16), 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 17)); + w0 = 
_mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 19)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 21)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 22)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 23)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 25)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 26)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 27)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 28)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 29)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 30)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 31)); + _mm256_storeu_si256(compressed + 0, w0); +} + +/* we are going to pack 256 2-bit values, touching 2 256-bit words, using 32 + * bytes */ +static void avxpackblock2(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 2 256-bit words */ + __m256i w0, w1; + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 22)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 26)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 28)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 30)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256(in + 16); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 20)); + w1 = _mm256_or_si256(w1, 
_mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 22)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 24)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 26)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 28)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 30)); + _mm256_storeu_si256(compressed + 1, w1); +} + +/* we are going to pack 256 3-bit values, touching 3 256-bit words, using 48 + * bytes */ +static void avxpackblock3(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 3 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 9)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 15)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 21)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 27)); + tmp = _mm256_lddqu_si256(in + 10); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 1)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 7)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 13)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 16), 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 19)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 22)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 25)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 28)); + tmp = _mm256_lddqu_si256(in + 21); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 17)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 23)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 26)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 29)); + _mm256_storeu_si256(compressed + 2, w0); +} + +/* we are going to pack 256 4-bit values, touching 4 256-bit words, using 64 + 
* bytes */ +static void avxpackblock4(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 4 256-bit words */ + __m256i w0, w1; + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 28)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256(in + 8); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 20)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 24)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 28)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_lddqu_si256(in + 16); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 28)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256(in + 24); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 20)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 24)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 28)); + _mm256_storeu_si256(compressed + 3, w1); +} + +/* we are going to pack 256 5-bit values, touching 5 256-bit words, using 80 + * bytes */ +static void avxpackblock5(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 5 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 15)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 25)); + tmp = _mm256_lddqu_si256(in + 6); + w0 = _mm256_or_si256(w0, 
_mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 3)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 13)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 23)); + tmp = _mm256_lddqu_si256(in + 12); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 16), 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 21)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 26)); + tmp = _mm256_lddqu_si256(in + 19); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 31)); + w1 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 19)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 24)); + tmp = _mm256_lddqu_si256(in + 25); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 7)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 17)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 22)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 27)); + _mm256_storeu_si256(compressed + 4, w0); +} + +/* we are going to pack 256 6-bit values, touching 6 256-bit words, using 96 + * bytes */ +static void avxpackblock6(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 6 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 24)); + tmp = _mm256_lddqu_si256(in + 5); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 16)); + w1 = _mm256_or_si256(w1, 
_mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 22)); + tmp = _mm256_lddqu_si256(in + 10); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 26)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256(in + 16); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 24)); + tmp = _mm256_lddqu_si256(in + 21); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 22)); + tmp = _mm256_lddqu_si256(in + 26); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 20)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 26)); + _mm256_storeu_si256(compressed + 5, w1); +} + +/* we are going to pack 256 7-bit values, touching 7 256-bit words, using 112 + * bytes */ +static void avxpackblock7(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 7 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 7)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 21)); + tmp = _mm256_lddqu_si256(in + 4); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 3)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 17)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 24)); + tmp = _mm256_lddqu_si256(in + 9); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 13)); + w0 = 
_mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 20)); + tmp = _mm256_lddqu_si256(in + 13); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 16), 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 23)); + tmp = _mm256_lddqu_si256(in + 18); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 19)); + tmp = _mm256_lddqu_si256(in + 22); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 1)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 15)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 22)); + tmp = _mm256_lddqu_si256(in + 27); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 25)); + _mm256_storeu_si256(compressed + 6, w0); +} + +/* we are going to pack 256 8-bit values, touching 8 256-bit words, using 128 + * bytes */ +static void avxpackblock8(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 8 256-bit words */ + __m256i w0, w1; + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 24)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256(in + 4); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 24)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_lddqu_si256(in + 8); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 24)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256(in + 12); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 24)); + _mm256_storeu_si256(compressed + 3, w1); + w0 = 
_mm256_lddqu_si256(in + 16); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 24)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256(in + 20); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 24)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256(in + 24); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 24)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256(in + 28); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 24)); + _mm256_storeu_si256(compressed + 7, w1); +} + +/* we are going to pack 256 9-bit values, touching 9 256-bit words, using 144 + * bytes */ +static void avxpackblock9(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 9 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 9)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 18)); + tmp = _mm256_lddqu_si256(in + 3); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 13)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 22)); + tmp = _mm256_lddqu_si256(in + 7); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 17)); + tmp = _mm256_lddqu_si256(in + 10); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 3)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 21)); + tmp = _mm256_lddqu_si256(in + 14); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 7)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 16), 16)); + tmp = _mm256_lddqu_si256(in + 17); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 2)); + w1 = _mm256_or_si256(w1, 
_mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 11)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 20)); + tmp = _mm256_lddqu_si256(in + 21); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 15)); + tmp = _mm256_lddqu_si256(in + 24); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 1)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 19)); + tmp = _mm256_lddqu_si256(in + 28); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 23)); + _mm256_storeu_si256(compressed + 8, w0); +} + +/* we are going to pack 256 10-bit values, touching 10 256-bit words, using 160 + * bytes */ +static void avxpackblock10(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 10 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 20)); + tmp = _mm256_lddqu_si256(in + 3); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 18)); + tmp = _mm256_lddqu_si256(in + 6); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 16)); + tmp = _mm256_lddqu_si256(in + 9); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 14)); + tmp = _mm256_lddqu_si256(in + 12); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 22)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256(in + 16); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 20)); + tmp = _mm256_lddqu_si256(in + 19); + w1 = _mm256_or_si256(w1, 
_mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 18)); + tmp = _mm256_lddqu_si256(in + 22); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 16)); + tmp = _mm256_lddqu_si256(in + 25); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 14)); + tmp = _mm256_lddqu_si256(in + 28); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 22)); + _mm256_storeu_si256(compressed + 9, w1); +} + +/* we are going to pack 256 11-bit values, touching 11 256-bit words, using 176 + * bytes */ +static void avxpackblock11(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 11 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 11)); + tmp = _mm256_lddqu_si256(in + 2); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 1)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 12)); + tmp = _mm256_lddqu_si256(in + 5); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 13)); + tmp = _mm256_lddqu_si256(in + 8); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 3)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 14)); + tmp = _mm256_lddqu_si256(in + 11); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 25)); + w0 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 15)); + tmp = _mm256_lddqu_si256(in + 14); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 5)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 16), 16)); + tmp = _mm256_lddqu_si256(in + 17); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 
27)); + w0 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 17)); + tmp = _mm256_lddqu_si256(in + 20); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 7)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 18)); + tmp = _mm256_lddqu_si256(in + 23); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 19)); + tmp = _mm256_lddqu_si256(in + 26); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 20)); + tmp = _mm256_lddqu_si256(in + 29); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 21)); + _mm256_storeu_si256(compressed + 10, w0); +} + +/* we are going to pack 256 12-bit values, touching 12 256-bit words, using 192 + * bytes */ +static void avxpackblock12(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 12 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 12)); + tmp = _mm256_lddqu_si256(in + 2); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 16)); + tmp = _mm256_lddqu_si256(in + 5); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 20)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256(in + 8); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 12)); + tmp = _mm256_lddqu_si256(in + 10); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 16)); + tmp = _mm256_lddqu_si256(in + 13); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 20)); + 
_mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256(in + 16); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 12)); + tmp = _mm256_lddqu_si256(in + 18); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 16)); + tmp = _mm256_lddqu_si256(in + 21); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 20)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256(in + 24); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 12)); + tmp = _mm256_lddqu_si256(in + 26); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 16)); + tmp = _mm256_lddqu_si256(in + 29); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 20)); + _mm256_storeu_si256(compressed + 11, w1); +} + +/* we are going to pack 256 13-bit values, touching 13 256-bit words, using 208 + * bytes */ +static void avxpackblock13(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 13 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 13)); + tmp = _mm256_lddqu_si256(in + 2); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 7)); + tmp = _mm256_lddqu_si256(in + 4); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 14)); + tmp = _mm256_lddqu_si256(in + 7); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 8)); + tmp = _mm256_lddqu_si256(in + 9); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 21)); + w0 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 15)); + tmp = _mm256_lddqu_si256(in + 12); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 9)); + tmp = 
_mm256_lddqu_si256(in + 14); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 16), 16)); + tmp = _mm256_lddqu_si256(in + 17); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 10)); + tmp = _mm256_lddqu_si256(in + 19); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 17)); + tmp = _mm256_lddqu_si256(in + 22); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 11)); + tmp = _mm256_lddqu_si256(in + 24); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 18)); + tmp = _mm256_lddqu_si256(in + 27); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 31)); + w1 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 12)); + tmp = _mm256_lddqu_si256(in + 29); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 25)); + w0 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 19)); + _mm256_storeu_si256(compressed + 12, w0); +} + +/* we are going to pack 256 14-bit values, touching 14 256-bit words, using 224 + * bytes */ +static void avxpackblock14(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 14 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 14)); + tmp = _mm256_lddqu_si256(in + 2); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 10)); + tmp = _mm256_lddqu_si256(in + 4); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 6)); + tmp = _mm256_lddqu_si256(in + 6); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 16)); + tmp = _mm256_lddqu_si256(in + 9); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + w0 
= _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 12)); + tmp = _mm256_lddqu_si256(in + 11); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 8)); + tmp = _mm256_lddqu_si256(in + 13); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 18)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256(in + 16); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 14)); + tmp = _mm256_lddqu_si256(in + 18); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 10)); + tmp = _mm256_lddqu_si256(in + 20); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 6)); + tmp = _mm256_lddqu_si256(in + 22); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 16)); + tmp = _mm256_lddqu_si256(in + 25); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 12)); + tmp = _mm256_lddqu_si256(in + 27); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 8)); + tmp = _mm256_lddqu_si256(in + 29); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 18)); + _mm256_storeu_si256(compressed + 13, w1); +} + +/* we are going to pack 256 15-bit values, touching 15 256-bit words, using 240 + * bytes */ +static void avxpackblock15(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 15 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 15)); + tmp = _mm256_lddqu_si256(in + 2); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 13)); + tmp = _mm256_lddqu_si256(in + 4); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 11)); + tmp = _mm256_lddqu_si256(in + 6); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 
26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 9)); + tmp = _mm256_lddqu_si256(in + 8); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 7)); + tmp = _mm256_lddqu_si256(in + 10); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 5)); + tmp = _mm256_lddqu_si256(in + 12); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 3)); + tmp = _mm256_lddqu_si256(in + 14); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 1)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 16), 16)); + tmp = _mm256_lddqu_si256(in + 17); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 14)); + tmp = _mm256_lddqu_si256(in + 19); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 12)); + tmp = _mm256_lddqu_si256(in + 21); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 27)); + w0 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 10)); + tmp = _mm256_lddqu_si256(in + 23); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 8)); + tmp = _mm256_lddqu_si256(in + 25); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 6)); + tmp = _mm256_lddqu_si256(in + 27); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 4)); + tmp = _mm256_lddqu_si256(in + 29); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 19)); + w0 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 30), 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 17)); + _mm256_storeu_si256(compressed + 14, w0); +} + +/* we are going to pack 256 16-bit values, touching 16 256-bit words, using 256 + * bytes */ +static void avxpackblock16(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 16 256-bit words */ + __m256i w0, w1; + w0 = _mm256_lddqu_si256(in + 0); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 1), 16)); + _mm256_storeu_si256(compressed + 0, w0); + 
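+ /* Clarifying note (not generated output): with 16-bit values the per-lane
+  * bit offsets are only 0 and 16, so no value straddles a 32-bit boundary;
+  * every output word is simply in[2*i] | (in[2*i+1] << 16) per lane, which
+  * is why this routine needs no tmp carry register and no _mm256_srli_epi32. */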
w1 = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 16)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_lddqu_si256(in + 4); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 16)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256(in + 6); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 16)); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_lddqu_si256(in + 8); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 16)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256(in + 10); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 16)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256(in + 12); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 16)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256(in + 14); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 16)); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_lddqu_si256(in + 16); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 16)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256(in + 18); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 16)); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_lddqu_si256(in + 20); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 16)); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_lddqu_si256(in + 22); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 16)); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_lddqu_si256(in + 24); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 16)); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_lddqu_si256(in + 26); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 16)); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_lddqu_si256(in + 28); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 16)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256(in + 30); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 16)); + _mm256_storeu_si256(compressed + 15, w1); +} + +/* we are going to pack 256 17-bit values, touching 17 256-bit words, using 272 + * bytes */ +static void avxpackblock17(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 17 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 2)); + tmp = _mm256_lddqu_si256(in + 3); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 19)); + w0 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 4)); + tmp = _mm256_lddqu_si256(in + 5); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 6)); + tmp = _mm256_lddqu_si256(in + 7); + w1 = 
_mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 8)); + tmp = _mm256_lddqu_si256(in + 9); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 10)); + tmp = _mm256_lddqu_si256(in + 11); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 27)); + w0 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 12)); + tmp = _mm256_lddqu_si256(in + 13); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 14)); + tmp = _mm256_lddqu_si256(in + 15); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256(in + 16); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 1)); + tmp = _mm256_lddqu_si256(in + 18); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 3)); + tmp = _mm256_lddqu_si256(in + 20); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 5)); + tmp = _mm256_lddqu_si256(in + 22); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 7)); + tmp = _mm256_lddqu_si256(in + 24); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 9)); + tmp = _mm256_lddqu_si256(in + 26); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 11)); + tmp = _mm256_lddqu_si256(in + 28); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 13)); + tmp = _mm256_lddqu_si256(in + 30); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 15)); + _mm256_storeu_si256(compressed + 16, w0); +} + +/* we are going to pack 256 18-bit values, touching 18 256-bit words, using 288 + * bytes */ +static void avxpackblock18(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 18 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = 
_mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 4)); + tmp = _mm256_lddqu_si256(in + 3); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 8)); + tmp = _mm256_lddqu_si256(in + 5); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 12)); + tmp = _mm256_lddqu_si256(in + 7); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256(in + 8); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 2)); + tmp = _mm256_lddqu_si256(in + 10); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 6)); + tmp = _mm256_lddqu_si256(in + 12); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 10)); + tmp = _mm256_lddqu_si256(in + 14); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 14)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256(in + 16); + tmp = _mm256_lddqu_si256(in + 17); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 4)); + tmp = _mm256_lddqu_si256(in + 19); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 8)); + tmp = _mm256_lddqu_si256(in + 21); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 12)); + tmp = _mm256_lddqu_si256(in + 23); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256(in + 24); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 2)); + tmp = _mm256_lddqu_si256(in + 26); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 6)); + tmp = _mm256_lddqu_si256(in + 28); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + 
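+ /* Boundary handling, added for clarity (not generated output): when a
+  * value's per-lane bit offset p satisfies p + 18 > 32, its low 32 - p bits
+  * finish the current word via _mm256_slli_epi32(tmp, p) and its remaining
+  * p + 18 - 32 high bits seed the next word via _mm256_srli_epi32(tmp, 32 - p);
+  * that is why every slli/srli pair applied to the same tmp sums to 32. */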
_mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 10)); + tmp = _mm256_lddqu_si256(in + 30); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 14)); + _mm256_storeu_si256(compressed + 17, w1); +} + +/* we are going to pack 256 19-bit values, touching 19 256-bit words, using 304 + * bytes */ +static void avxpackblock19(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 19 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 19)); + w1 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 6)); + tmp = _mm256_lddqu_si256(in + 3); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 25)); + w0 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 12)); + tmp = _mm256_lddqu_si256(in + 5); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 31)); + w1 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256(in + 6); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 5)); + tmp = _mm256_lddqu_si256(in + 8); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 11)); + tmp = _mm256_lddqu_si256(in + 10); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256(in + 11); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 4)); + tmp = _mm256_lddqu_si256(in + 13); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 10)); + tmp = _mm256_lddqu_si256(in + 15); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256(in + 16); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 3)); + tmp = _mm256_lddqu_si256(in + 18); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 9)); + tmp = _mm256_lddqu_si256(in + 20); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256(in + 21); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 
15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 2)); + tmp = _mm256_lddqu_si256(in + 23); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 21)); + w0 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 8)); + tmp = _mm256_lddqu_si256(in + 25); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256(in + 26); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 1)); + tmp = _mm256_lddqu_si256(in + 28); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 7)); + tmp = _mm256_lddqu_si256(in + 30); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 13)); + _mm256_storeu_si256(compressed + 18, w0); +} + +/* we are going to pack 256 20-bit values, touching 20 256-bit words, using 320 + * bytes */ +static void avxpackblock20(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 20 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 8)); + tmp = _mm256_lddqu_si256(in + 3); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256(in + 4); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 4)); + tmp = _mm256_lddqu_si256(in + 6); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 12)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256(in + 8); + tmp = _mm256_lddqu_si256(in + 9); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 8)); + tmp = _mm256_lddqu_si256(in + 11); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256(in + 12); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 4)); + tmp = _mm256_lddqu_si256(in + 14); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + 
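+ /* Periodicity note, added for clarity (not generated output): for 20-bit
+  * values lcm(20, 32) = 160 bits, so the layout repeats every 8 inputs and
+  * 5 output words per lane; that is why fresh words are started at in + 8,
+  * in + 16 and in + 24 with no bits carried across. */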
_mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 12)); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_lddqu_si256(in + 16); + tmp = _mm256_lddqu_si256(in + 17); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 8)); + tmp = _mm256_lddqu_si256(in + 19); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256(in + 20); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 4)); + tmp = _mm256_lddqu_si256(in + 22); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 12)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256(in + 24); + tmp = _mm256_lddqu_si256(in + 25); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 8)); + tmp = _mm256_lddqu_si256(in + 27); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256(in + 28); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 4)); + tmp = _mm256_lddqu_si256(in + 30); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 12)); + _mm256_storeu_si256(compressed + 19, w1); +} + +/* we are going to pack 256 21-bit values, touching 21 256-bit words, using 336 + * bytes */ +static void avxpackblock21(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 21 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 2), 10)); + tmp = _mm256_lddqu_si256(in + 3); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256(in + 4); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 9)); + tmp = _mm256_lddqu_si256(in + 6); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256(in + 7); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 19)); + w1 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 4, w0); + 
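+ /* Register usage note, added for clarity (not generated output): w0 and w1
+  * alternate as the word being filled and the word being seeded, so the
+  * carried high bits of a straddling value can be placed in the spare
+  * register before the finished word is stored; only two accumulators and
+  * one tmp are live at any point. */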
w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 8), 8)); + tmp = _mm256_lddqu_si256(in + 9); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256(in + 10); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 7)); + tmp = _mm256_lddqu_si256(in + 12); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256(in + 13); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 6)); + tmp = _mm256_lddqu_si256(in + 15); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 27)); + w0 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256(in + 16); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 5)); + tmp = _mm256_lddqu_si256(in + 18); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256(in + 19); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 20), 4)); + tmp = _mm256_lddqu_si256(in + 21); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 25)); + w0 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256(in + 22); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 14)); + w1 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 3)); + tmp = _mm256_lddqu_si256(in + 24); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256(in + 25); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 13)); + w1 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 2)); + tmp = _mm256_lddqu_si256(in + 27); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256(in + 28); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 29), 1)); + tmp = _mm256_lddqu_si256(in + 30); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 11)); + _mm256_storeu_si256(compressed + 20, w0); +} + +/* we are going to pack 256 22-bit values, touching 22 256-bit words, using 352 + * bytes */ +static void avxpackblock22(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going 
to touch 22 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 2)); + tmp = _mm256_lddqu_si256(in + 4); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256(in + 5); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 4)); + tmp = _mm256_lddqu_si256(in + 7); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256(in + 8); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 6)); + tmp = _mm256_lddqu_si256(in + 10); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256(in + 11); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 8)); + tmp = _mm256_lddqu_si256(in + 13); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256(in + 14); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 10)); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_lddqu_si256(in + 16); + tmp = _mm256_lddqu_si256(in + 17); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256(in + 18); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 2)); + tmp = _mm256_lddqu_si256(in + 20); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256(in + 21); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 14)); + w1 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 4)); + tmp = _mm256_lddqu_si256(in + 23); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256(in + 24); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1, 
_mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 6)); + tmp = _mm256_lddqu_si256(in + 26); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256(in + 27); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 8)); + tmp = _mm256_lddqu_si256(in + 29); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256(in + 30); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 10)); + _mm256_storeu_si256(compressed + 21, w1); +} + +/* we are going to pack 256 23-bit values, touching 23 256-bit words, using 368 + * bytes */ +static void avxpackblock23(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 23 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 23)); + w1 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 5)); + tmp = _mm256_lddqu_si256(in + 4); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256(in + 5); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 19)); + w0 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256(in + 6); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 10)); + w1 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 1)); + tmp = _mm256_lddqu_si256(in + 8); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256(in + 9); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 6)); + tmp = _mm256_lddqu_si256(in + 11); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256(in + 12); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256(in + 13); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 11)); + w0 = _mm256_srli_epi32(tmp, 21); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 14), 2)); + tmp = _mm256_lddqu_si256(in + 15); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256(in + 16); + w1 = 
_mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 17), 7)); + tmp = _mm256_lddqu_si256(in + 18); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256(in + 19); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 21)); + w0 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256(in + 20); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 3)); + tmp = _mm256_lddqu_si256(in + 22); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256(in + 23); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 24), 8)); + tmp = _mm256_lddqu_si256(in + 25); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256(in + 26); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256(in + 27); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 13)); + w0 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 28), 4)); + tmp = _mm256_lddqu_si256(in + 29); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_lddqu_si256(in + 30); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 21, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 9)); + _mm256_storeu_si256(compressed + 22, w0); +} + +/* we are going to pack 256 24-bit values, touching 24 256-bit words, using 384 + * bytes */ +static void avxpackblock24(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 24 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 3), 8)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256(in + 4); + tmp = _mm256_lddqu_si256(in + 5); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256(in + 6); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, 
_mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 8)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256(in + 8); + tmp = _mm256_lddqu_si256(in + 9); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256(in + 10); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 11), 8)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256(in + 12); + tmp = _mm256_lddqu_si256(in + 13); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256(in + 14); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 8)); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_lddqu_si256(in + 16); + tmp = _mm256_lddqu_si256(in + 17); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256(in + 18); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 8)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256(in + 20); + tmp = _mm256_lddqu_si256(in + 21); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256(in + 22); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 8)); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_lddqu_si256(in + 24); + tmp = _mm256_lddqu_si256(in + 25); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256(in + 26); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 8)); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_lddqu_si256(in + 28); + tmp = _mm256_lddqu_si256(in + 29); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256(in + 30); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 22, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 8)); + _mm256_storeu_si256(compressed + 23, w1); +} + +/* we are going to pack 256 25-bit values, touching 25 256-bit words, using 400 + * bytes */ +static void avxpackblock25(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 25 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, 
_mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256(in + 3); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 11)); + w1 = _mm256_srli_epi32(tmp, 21); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 4), 4)); + tmp = _mm256_lddqu_si256(in + 5); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256(in + 6); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256(in + 7); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 15)); + w0 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256(in + 8); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 9), 1)); + tmp = _mm256_lddqu_si256(in + 10); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256(in + 11); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 19)); + w1 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256(in + 12); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 13), 5)); + tmp = _mm256_lddqu_si256(in + 14); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256(in + 15); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256(in + 16); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256(in + 17); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 9)); + w0 = _mm256_srli_epi32(tmp, 23); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 18), 2)); + tmp = _mm256_lddqu_si256(in + 19); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256(in + 20); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256(in + 21); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 13)); + w1 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 22), 6)); + tmp = _mm256_lddqu_si256(in + 23); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256(in + 24); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + 
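+ /* Memory-access note, added for clarity (not generated output): the kernels
+  * use _mm256_lddqu_si256 and _mm256_storeu_si256 throughout, so neither pin
+  * nor compressed needs 32-byte alignment. For 25-bit values gcd(25, 32) = 1,
+  * so every per-lane offset 0..31 occurs and all 24 interior word boundaries
+  * fall inside a value, each handled by an slli/srli pair. */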
_mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256(in + 25); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 17)); + w0 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256(in + 26); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 10)); + w1 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 27), 3)); + tmp = _mm256_lddqu_si256(in + 28); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256(in + 29); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256(in + 30); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 23, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 7)); + _mm256_storeu_si256(compressed + 24, w0); +} + +/* we are going to pack 256 26-bit values, touching 26 256-bit words, using 416 + * bytes */ +static void avxpackblock26(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 26 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256(in + 3); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 14)); + w1 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256(in + 4); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 5), 2)); + tmp = _mm256_lddqu_si256(in + 6); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256(in + 7); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256(in + 8); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256(in + 9); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 10)); + w0 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 4)); + tmp = _mm256_lddqu_si256(in + 11); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256(in + 12); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256(in + 13); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256(in + 14); + w1 = 
_mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 6)); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_lddqu_si256(in + 16); + tmp = _mm256_lddqu_si256(in + 17); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256(in + 18); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256(in + 19); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256(in + 20); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 2)); + tmp = _mm256_lddqu_si256(in + 22); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256(in + 23); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256(in + 24); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256(in + 25); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 10)); + w1 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 26), 4)); + tmp = _mm256_lddqu_si256(in + 27); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256(in + 28); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256(in + 29); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256(in + 30); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 24, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 6)); + _mm256_storeu_si256(compressed + 25, w1); +} + +/* we are going to pack 256 27-bit values, touching 27 256-bit words, using 432 + * bytes */ +static void avxpackblock27(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 27 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256(in + 3); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256(in + 4); + w1 = 
_mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256(in + 5); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 7)); + w1 = _mm256_srli_epi32(tmp, 25); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 6), 2)); + tmp = _mm256_lddqu_si256(in + 7); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256(in + 8); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256(in + 9); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 19)); + w0 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256(in + 10); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 14)); + w1 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256(in + 11); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 9)); + w0 = _mm256_srli_epi32(tmp, 23); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 12), 4)); + tmp = _mm256_lddqu_si256(in + 13); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 31)); + w1 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256(in + 14); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256(in + 15); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256(in + 16); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256(in + 17); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 11)); + w1 = _mm256_srli_epi32(tmp, 21); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256(in + 18); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 6)); + w0 = _mm256_srli_epi32(tmp, 26); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 19), 1)); + tmp = _mm256_lddqu_si256(in + 20); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256(in + 21); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256(in + 22); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256(in + 23); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 13)); + w0 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256(in + 24); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 25), 3)); + tmp = _mm256_lddqu_si256(in + 26); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256(in + 27); 
+ w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256(in + 28); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256(in + 29); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_lddqu_si256(in + 30); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 10)); + w0 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 25, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 5)); + _mm256_storeu_si256(compressed + 26, w0); +} + +/* we are going to pack 256 28-bit values, touching 28 256-bit words, using 448 + * bytes */ +static void avxpackblock28(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 28 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256(in + 3); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256(in + 4); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256(in + 5); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256(in + 6); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 7), 4)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256(in + 8); + tmp = _mm256_lddqu_si256(in + 9); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256(in + 10); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256(in + 11); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256(in + 12); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256(in + 13); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256(in + 14); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 4)); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_lddqu_si256(in + 16); + tmp = _mm256_lddqu_si256(in + 17); + w0 = 
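Across all of these generated packers the shift amounts follow one rule: within a lane, value i of a k-bit stream starts at bit (i*k) mod 32 of output word (i*k)/32, and whenever a value straddles a 32-bit boundary its low part is merged with _mm256_slli_epi32 while its high part is carried into the next word through tmp with _mm256_srli_epi32. A block of 256 k-bit values therefore occupies exactly k 256-bit output words, i.e. 32*k bytes; the byte counts in the generated comments (416, 432, ...) appear to correspond to a 128-value variant rather than to these 256-value blocks. A small sketch of the offset arithmetic, with an illustrative width of 27 bits:

/* Sketch: print where value i of a 27-bit lane stream lands (illustrative). */
#include <stdio.h>

int main(void) {
    const int k = 27;                       /* bit width, 1..31 */
    int i;
    for (i = 0; i < 32; i++) {
        int word  = (i * k) / 32;           /* output word index            */
        int shift = (i * k) % 32;           /* _mm256_slli_epi32 amount     */
        int carry = (shift + k > 32);       /* needs _mm256_srli_epi32 too? */
        printf("value %2d -> word %2d, shift %2d%s\n",
               i, word, shift, carry ? " (straddles into the next word)" : "");
    }
    return 0;
}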
_mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256(in + 18); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256(in + 19); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256(in + 20); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256(in + 21); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256(in + 22); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 23), 4)); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_lddqu_si256(in + 24); + tmp = _mm256_lddqu_si256(in + 25); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256(in + 26); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256(in + 27); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256(in + 28); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_lddqu_si256(in + 29); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_lddqu_si256(in + 30); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 26, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 4)); + _mm256_storeu_si256(compressed + 27, w1); +} + +/* we are going to pack 256 29-bit values, touching 29 256-bit words, using 464 + * bytes */ +static void avxpackblock29(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 29 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256(in + 3); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 23)); + w1 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256(in + 4); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256(in + 5); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256(in + 6); + 
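For widths that share a factor with 32 the pattern repeats after a short period: in the 28-bit packer just shown, 8 values fill exactly 7 words (lcm(28, 32) = 224 bits), so the code restarts from a freshly loaded word at in + 8, in + 16 and in + 24 with no tmp carry needed at those points, and the 30-bit packer further down does the same every 16 values. A quick way to compute that period, nothing more than the arithmetic itself:

/* Sketch: period of the packing pattern for each width k (illustrative). */
#include <stdio.h>

static int gcd(int a, int b) { return b ? gcd(b, a % b) : a; }

int main(void) {
    int k;
    for (k = 1; k <= 32; k++) {
        int values_per_period = 32 / gcd(k, 32); /* inputs before a clean word boundary */
        int words_per_period  = k  / gcd(k, 32); /* 32-bit words they fill              */
        printf("k=%2d: %2d values per %2d words\n", k, values_per_period, words_per_period);
    }
    return 0;
}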
w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256(in + 7); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 11)); + w1 = _mm256_srli_epi32(tmp, 21); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256(in + 8); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256(in + 9); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 5)); + w1 = _mm256_srli_epi32(tmp, 27); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 10), 2)); + tmp = _mm256_lddqu_si256(in + 11); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256(in + 12); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256(in + 13); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 25)); + w0 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256(in + 14); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256(in + 15); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 19)); + w0 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256(in + 16); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256(in + 17); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 13)); + w0 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256(in + 18); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 10)); + w1 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256(in + 19); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 7)); + w0 = _mm256_srli_epi32(tmp, 25); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256(in + 20); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 4)); + w1 = _mm256_srli_epi32(tmp, 28); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 21), 1)); + tmp = _mm256_lddqu_si256(in + 22); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256(in + 23); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_lddqu_si256(in + 24); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256(in + 25); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256(in + 26); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256(in + 27); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 24, w0); + tmp = 
_mm256_lddqu_si256(in + 28); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_lddqu_si256(in + 29); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 9)); + w1 = _mm256_srli_epi32(tmp, 23); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_lddqu_si256(in + 30); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 6)); + w0 = _mm256_srli_epi32(tmp, 26); + _mm256_storeu_si256(compressed + 27, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 3)); + _mm256_storeu_si256(compressed + 28, w0); +} + +/* we are going to pack 256 30-bit values, touching 30 256-bit words, using 480 + * bytes */ +static void avxpackblock30(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 30 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256(in + 3); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256(in + 4); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256(in + 5); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256(in + 6); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256(in + 7); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256(in + 8); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256(in + 9); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 14)); + w1 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256(in + 10); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256(in + 11); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 10)); + w1 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256(in + 12); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256(in + 13); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 6)); + w1 = _mm256_srli_epi32(tmp, 26); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256(in + 14); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 4)); + w0 = _mm256_srli_epi32(tmp, 28); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 15), 2)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256(in + 16); + tmp = 
_mm256_lddqu_si256(in + 17); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256(in + 18); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256(in + 19); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256(in + 20); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256(in + 21); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256(in + 22); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_lddqu_si256(in + 23); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256(in + 24); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256(in + 25); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256(in + 26); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_lddqu_si256(in + 27); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 10)); + w0 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_lddqu_si256(in + 28); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_lddqu_si256(in + 29); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 6)); + w0 = _mm256_srli_epi32(tmp, 26); + _mm256_storeu_si256(compressed + 27, w1); + tmp = _mm256_lddqu_si256(in + 30); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 4)); + w1 = _mm256_srli_epi32(tmp, 28); + _mm256_storeu_si256(compressed + 28, w0); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 2)); + _mm256_storeu_si256(compressed + 29, w1); +} + +/* we are going to pack 256 31-bit values, touching 31 256-bit words, using 496 + * bytes */ +static void avxpackblock31(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 31 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256(in + 0); + tmp = _mm256_lddqu_si256(in + 1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 31)); + w1 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256(in + 2); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256(in + 3); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256(in + 4); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + 
_mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256(in + 5); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256(in + 6); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256(in + 7); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256(in + 8); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256(in + 9); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 23)); + w1 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256(in + 10); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256(in + 11); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256(in + 12); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256(in + 13); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 19)); + w1 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256(in + 14); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256(in + 15); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256(in + 16); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256(in + 17); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256(in + 18); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256(in + 19); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 13)); + w1 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256(in + 20); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256(in + 21); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 11)); + w1 = _mm256_srli_epi32(tmp, 21); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_lddqu_si256(in + 22); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 10)); + w0 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256(in + 23); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 9)); + w1 = _mm256_srli_epi32(tmp, 23); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256(in + 24); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256(in + 25); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 7)); 
+ w1 = _mm256_srli_epi32(tmp, 25); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_lddqu_si256(in + 26); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 6)); + w0 = _mm256_srli_epi32(tmp, 26); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_lddqu_si256(in + 27); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 5)); + w1 = _mm256_srli_epi32(tmp, 27); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_lddqu_si256(in + 28); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 4)); + w0 = _mm256_srli_epi32(tmp, 28); + _mm256_storeu_si256(compressed + 27, w1); + tmp = _mm256_lddqu_si256(in + 29); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 3)); + w1 = _mm256_srli_epi32(tmp, 29); + _mm256_storeu_si256(compressed + 28, w0); + tmp = _mm256_lddqu_si256(in + 30); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 2)); + w0 = _mm256_srli_epi32(tmp, 30); + _mm256_storeu_si256(compressed + 29, w1); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(_mm256_lddqu_si256(in + 31), 1)); + _mm256_storeu_si256(compressed + 30, w0); +} + +/* we are going to pack 256 32-bit values, touching 32 256-bit words, using 512 + * bytes */ +static void avxpackblock32(const uint32_t *pin, __m256i *compressed) { + const __m256i *in = (const __m256i *)pin; + /* we are going to touch 32 256-bit words */ + __m256i w0, w1; + w0 = _mm256_lddqu_si256(in + 0); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256(in + 1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_lddqu_si256(in + 2); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256(in + 3); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_lddqu_si256(in + 4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256(in + 5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256(in + 6); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256(in + 7); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_lddqu_si256(in + 8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256(in + 9); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_lddqu_si256(in + 10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_lddqu_si256(in + 11); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_lddqu_si256(in + 12); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_lddqu_si256(in + 13); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_lddqu_si256(in + 14); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256(in + 15); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_lddqu_si256(in + 16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_lddqu_si256(in + 17); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_lddqu_si256(in + 18); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_lddqu_si256(in + 19); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_lddqu_si256(in + 20); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_lddqu_si256(in + 21); + _mm256_storeu_si256(compressed + 21, w1); + w0 = _mm256_lddqu_si256(in + 22); + _mm256_storeu_si256(compressed + 22, w0); + w1 = _mm256_lddqu_si256(in + 23); + _mm256_storeu_si256(compressed + 23, w1); + w0 = _mm256_lddqu_si256(in + 24); + _mm256_storeu_si256(compressed + 24, w0); + w1 = _mm256_lddqu_si256(in + 25); + _mm256_storeu_si256(compressed + 25, w1); + w0 = _mm256_lddqu_si256(in + 26); + _mm256_storeu_si256(compressed + 26, w0); + w1 = _mm256_lddqu_si256(in + 27); + _mm256_storeu_si256(compressed + 27, 
w1); + w0 = _mm256_lddqu_si256(in + 28); + _mm256_storeu_si256(compressed + 28, w0); + w1 = _mm256_lddqu_si256(in + 29); + _mm256_storeu_si256(compressed + 29, w1); + w0 = _mm256_lddqu_si256(in + 30); + _mm256_storeu_si256(compressed + 30, w0); + w1 = _mm256_lddqu_si256(in + 31); + _mm256_storeu_si256(compressed + 31, w1); +} + +static void avxpackblockmask0(const uint32_t *pin, __m256i *compressed) { + (void)compressed; + (void)pin; /* we consumed 256 32-bit integers */ +} + +/* we are going to pack 256 1-bit values, touching 1 256-bit words, using 16 + * bytes */ +static void avxpackblockmask1(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 1 256-bit word */ + __m256i w0; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), 1)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), 2)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), 3)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), 4)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), 5)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), 6)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), 7)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), 8)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), 9)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 10)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 11)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 12)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 13)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 14)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 15)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)), + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 17)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 18)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 19)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 20)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 21)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 22)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 23)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 24)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 
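The two degenerate cases behave as expected: avxpackblock32 streams the 32 input vectors to the output unchanged, which amounts to copying 1024 bytes (256 values of 4 bytes), and avxpackblockmask0 consumes the block without writing anything because every value is known to be zero. A sketch of the 32-bit equivalence, assuming unaligned buffers just like the unaligned loads and stores above:

/* Sketch: avxpackblock32 is behaviourally a 1024-byte copy. Illustrative only. */
#include <stdint.h>
#include <string.h>

static void packblock32_equivalent(const uint32_t *in, void *compressed) {
    memcpy(compressed, in, 256 * sizeof(uint32_t)); /* 32 vectors of 32 bytes */
}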
25)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 26)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 27)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 28)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 29)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 30)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 31)); + _mm256_storeu_si256(compressed + 0, w0); +} + +/* we are going to pack 256 2-bit values, touching 2 256-bit words, using 32 + * bytes */ +static void avxpackblockmask2(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 2 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(3); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), 2)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), 4)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), 6)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), + 10)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), + 12)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), + 14)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), + 18)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 20)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 22)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 24)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 26)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 28)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 30)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 2)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 6)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 10)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 12)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in 
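The avxpackblockmaskN family reuses the layout of the plain packers but first ANDs every input vector with the broadcast constant (1 << N) - 1 (for example _mm256_set1_epi32(1) for 1-bit and _mm256_set1_epi32(3) for 2-bit values), so stray high bits cannot spill into neighbouring values; the unmasked avxpackblockN functions above simply assume the caller has already reduced each input to N bits. Per 32-bit lane the masking is nothing more than the sketch below, where the helper name is illustrative:

/* Sketch: the mask variants clamp each input to `bits` bits before packing. */
#include <stdint.h>

static uint32_t clamp_to_bits(uint32_t v, int bits) {
    uint32_t mask = (bits < 32) ? ((UINT32_C(1) << bits) - 1) : UINT32_C(0xFFFFFFFF);
    return v & mask; /* same effect as _mm256_and_si256(mask, input) on each lane */
}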
+ 23)), + 14)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 18)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 20)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 22)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 24)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 26)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 28)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 30)); + _mm256_storeu_si256(compressed + 1, w1); +} + +/* we are going to pack 256 3-bit values, touching 3 256-bit words, using 48 + * bytes */ +static void avxpackblockmask3(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 3 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(7); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), 3)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), 6)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), 9)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), + 12)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), + 15)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), + 18)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), + 21)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), + 24)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), + 27)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 1)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 7)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 10)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 13)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)), + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 19)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 22)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 25)); + w1 = _mm256_or_si256( + w1, 
_mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 28)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 2)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 5)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 11)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 14)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 17)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 20)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 23)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 26)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 29)); + _mm256_storeu_si256(compressed + 2, w0); +} + +/* we are going to pack 256 4-bit values, touching 4 256-bit words, using 64 + * bytes */ +static void avxpackblockmask4(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 4 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(15); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), 4)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), + 12)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), + 20)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), + 24)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), + 28)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 12)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 20)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 24)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 28)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w0 = _mm256_or_si256( + w0, 
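Because these functions are machine generated, the natural sanity check is a comparison against a scalar reference that packs each lane independently. The sketch below does this for avxpackblockmask3; it assumes it is compiled in the same translation unit as the static packers (or that their visibility is relaxed) and is illustrative rather than part of the library:

/* Sketch: cross-check avxpackblockmask3 against a per-lane scalar reference.
   Returns 1 when the 96-byte outputs agree. Illustrative only. */
#include <stdint.h>
#include <string.h>
#include <immintrin.h>

static int check_mask3(const uint32_t *in) { /* in: 256 values */
    uint32_t simd_out[3 * 8];                /* 3 words per lane, 8 lanes */
    uint32_t ref_out[3 * 8];
    int lane, i;
    memset(ref_out, 0, sizeof(ref_out));
    avxpackblockmask3(in, (__m256i *)simd_out);
    for (lane = 0; lane < 8; lane++) {       /* lane j packs in[8*i + j], i = 0..31 */
        int word = 0, offset = 0;
        for (i = 0; i < 32; i++) {
            uint32_t v = in[8 * i + lane] & 7u;
            ref_out[8 * word + lane] |= v << offset;
            if (offset + 3 > 32)             /* value straddles a word boundary */
                ref_out[8 * (word + 1) + lane] |= v >> (32 - offset);
            offset += 3;
            if (offset >= 32) { offset -= 32; word++; }
        }
    }
    return memcmp(simd_out, ref_out, sizeof(ref_out)) == 0;
}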
_mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 4)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 12)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 20)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 24)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 28)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 12)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 20)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 24)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 28)); + _mm256_storeu_si256(compressed + 3, w1); +} + +/* we are going to pack 256 5-bit values, touching 5 256-bit words, using 80 + * bytes */ +static void avxpackblockmask5(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 5 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(31); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), 5)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), + 10)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), + 15)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), + 20)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), + 25)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), 3)); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), + 13)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 18)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 23)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = 
_mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 1)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 6)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 11)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)), + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 21)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 26)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 31)); + w1 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 9)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 14)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 19)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 24)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 2)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 7)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 12)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 17)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 22)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 27)); + _mm256_storeu_si256(compressed + 4, w0); +} + +/* we are going to pack 256 6-bit values, touching 6 256-bit words, using 96 + * bytes */ +static void avxpackblockmask6(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 6 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(63); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), 6)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), + 12)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), + 18)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), + 24)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), + 10)); + 
w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), + 22)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 2)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 14)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 20)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 26)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 6)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 12)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 18)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 24)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 4)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 10)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 22)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 2)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 14)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 20)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 26)); + _mm256_storeu_si256(compressed + 5, w1); +} + +/* we are going to pack 256 7-bit values, touching 7 256-bit words, using 112 + * bytes */ +static void avxpackblockmask7(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 7 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(127); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), 7)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), + 
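In practice a caller first scans a block to find how many bits the largest value needs and then dispatches to the packer (or mask packer) of that width; the wrappers that do the dispatching are not shown in this part of the diff, so the sketch below only illustrates the width computation, and its name is illustrative rather than the library API:

/* Sketch: bits needed per value for a block of 256 integers (illustrative). */
#include <stdint.h>

static uint32_t bits_needed_256(const uint32_t *in) {
    uint32_t accumulator = 0;
    int i, bits = 0;
    for (i = 0; i < 256; i++)
        accumulator |= in[i];      /* OR of all values keeps the highest set bit */
    while (accumulator != 0) {     /* position of the highest set bit, plus one  */
        bits++;
        accumulator >>= 1;
    }
    return (uint32_t)bits;         /* 0..32: pick the packer of this width */
}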
14)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), + 21)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), 3)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), + 10)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), + 17)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), + 24)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 6)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 13)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 20)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 2)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 9)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)), + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 23)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 5)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 12)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 19)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 1)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 15)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 22)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 4)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 11)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 18)); + 
w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 25)); + _mm256_storeu_si256(compressed + 6, w0); +} + +/* we are going to pack 256 8-bit values, touching 8 256-bit words, using 128 + * bytes */ +static void avxpackblockmask8(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 8 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(255); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), + 24)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), + 24)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 24)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 24)); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 24)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 24)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 24)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w1 = _mm256_or_si256( 
+ w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 24)); + _mm256_storeu_si256(compressed + 7, w1); +} + +/* we are going to pack 256 9-bit values, touching 9 256-bit words, using 144 + * bytes */ +static void avxpackblockmask9(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 9 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(511); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), 9)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), + 18)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), + 13)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), + 22)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), + 17)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 3)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 12)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 21)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 7)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 2)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 11)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 20)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + 
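/* the 9-bit value loaded from in + 21 straddles a 32-bit boundary: its low 3 bits land in bits 29-31 of w1 (stored next as compressed + 5), while its high 6 bits have been shifted down into w0, which starts the following output word */ +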
_mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 6)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 15)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 1)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 10)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 19)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 5)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 14)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 23)); + _mm256_storeu_si256(compressed + 8, w0); +} + +/* we are going to pack 256 10-bit values, touching 10 256-bit words, using 160 + * bytes */ +static void avxpackblockmask10(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 10 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(1023); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), + 10)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), + 20)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), + 18)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), 6)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 14)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 2)); + w0 = 
_mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 12)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 22)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 10)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 20)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 18)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 6)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 4)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 14)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 2)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 12)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 22)); + _mm256_storeu_si256(compressed + 9, w1); +} + +/* we are going to pack 256 11-bit values, touching 11 256-bit words, using 176 + * bytes */ +static void avxpackblockmask11(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 11 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(2047); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), + 11)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), 1)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + 
_mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), 2)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), + 13)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), 3)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 14)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 25)); + w0 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 4)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 15)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 5)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 27)); + w0 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 6)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 17)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 7)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 18)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 19)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 9)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 20)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 10)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 21)); + _mm256_storeu_si256(compressed + 10, w0); +} + +/* we are going to pack 256 12-bit 
values, touching 12 256-bit words, using 192 + * bytes */ +static void avxpackblockmask12(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 12 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(4095); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), + 20)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 4)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 20)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 20)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w1 = _mm256_or_si256( + w1, 
_mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 4)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 8)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 20)); + _mm256_storeu_si256(compressed + 11, w1); +} + +/* we are going to pack 256 13-bit values, touching 13 256-bit words, using 208 + * bytes */ +static void avxpackblockmask13(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 13 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(8191); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), + 13)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), 7)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), 1)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), + 14)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 21)); + w0 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 2)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 15)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 9)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 3)); + w0 = _mm256_or_si256( + 
w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 10)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 4)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 17)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 11)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 5)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 18)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 31)); + w1 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 25)); + w0 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 6)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 19)); + _mm256_storeu_si256(compressed + 12, w0); +} + +/* we are going to pack 256 14-bit values, touching 14 256-bit words, using 224 + * bytes */ +static void avxpackblockmask14(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 14 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(16383); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), + 14)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), + 10)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = 
_mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), 2)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 4)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 18)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 14)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 10)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 2)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 4)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 18)); + 
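/* both in + 30 and in + 31 fit entirely in w1 (offsets 4 and 18, with 18 + 14 = 32), so the block ends without a straddling value and compressed + 13 is the last of the fourteen output words */ +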
_mm256_storeu_si256(compressed + 13, w1); +} + +/* we are going to pack 256 15-bit values, touching 15 256-bit words, using 240 + * bytes */ +static void avxpackblockmask15(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 15 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(32767); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), + 15)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), + 13)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), + 11)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), 9)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), 7)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 5)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 3)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 1)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)), + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 14)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 27)); + w0 = 
_mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 10)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 19)); + w0 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)), + 2)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 17)); + _mm256_storeu_si256(compressed + 14, w0); +} + +/* we are going to pack 256 16-bit values, touching 16 256-bit words, using 256 + * bytes */ +static void avxpackblockmask16(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 16 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(65535); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)), + 16)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), + 16)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), + 16)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), + 16)); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), + 16)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 16)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 16)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 16)); + _mm256_storeu_si256(compressed 
+ 7, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 16)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 16)); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 16)); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 16)); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 16)); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 16)); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 16)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 16)); + _mm256_storeu_si256(compressed + 15, w1); +} + +/* we are going to pack 256 17-bit values, touching 17 256-bit words, using 272 + * bytes */ +static void avxpackblockmask17(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 17 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(131071); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 19)); + w0 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 4, 
w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 10)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 27)); + w0 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 14)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 1)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 3)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 5)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 7)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 9)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 11)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 13)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 15)); + _mm256_storeu_si256(compressed + 16, w0); +} + +/* we are going to pack 256 18-bit values, touching 18 256-bit words, using 288 + * bytes */ +static void avxpackblockmask18(const uint32_t *pin, __m256i 
*compressed) { + /* we are going to touch 18 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(262143); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 10)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 14)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256( + w0, 
_mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 10)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 14)); + _mm256_storeu_si256(compressed + 17, w1); +} + +/* we are going to pack 256 19-bit values, touching 19 256-bit words, using 304 + * bytes */ +static void avxpackblockmask19(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 19 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(524287); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 19)); + w1 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 25)); + w0 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 31)); + w1 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), 5)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), + 11)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + 
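/* w0 and w1 alternate as the word under construction: w1 holds the low 2 bits of in + 10 in bits 30-31 and is stored next as compressed + 5, while the remaining 17 bits of that value already sit at the bottom of w0, the next output word */ +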
_mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 10)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 3)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 9)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 21)); + w0 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 1)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 7)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 13)); + _mm256_storeu_si256(compressed + 18, w0); +} + +/* we are going to pack 256 20-bit values, touching 20 256-bit words, using 320 + * bytes */ +static void avxpackblockmask20(const uint32_t *pin, 
__m256i *compressed) { + /* we are going to touch 20 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(1048575); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), + 12)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 12)); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 4)); + tmp = _mm256_and_si256(mask, 
_mm256_lddqu_si256(in + 22)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 12)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 12)); + _mm256_storeu_si256(compressed + 19, w1); +} + +/* we are going to pack 256 21-bit values, touching 21 256-bit words, using 336 + * bytes */ +static void avxpackblockmask21(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 21 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(2097151); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)), + 10)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), 9)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 19)); + w1 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)), 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w0 = _mm256_or_si256(w0, 
_mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 7)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 27)); + w0 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 5)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 25)); + w0 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 14)); + w1 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 3)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 13)); + w1 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)), + 1)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 11)); + _mm256_storeu_si256(compressed + 20, w0); 
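+ /* note: 21 is coprime with 32, so the running bit offset only returns to zero after all 32 inputs have been consumed; every internal 32-bit word boundary is straddled by some value, and the matching _mm256_slli_epi32/_mm256_srli_epi32 pair splits that value between the word being finished and the word being started. */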
+} + +/* we are going to pack 256 22-bit values, touching 22 256-bit words, using 352 + * bytes */ +static void avxpackblockmask22(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 22 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(4194303); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 10)); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 2)); + tmp = 
_mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 14)); + w1 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 10)); + _mm256_storeu_si256(compressed + 21, w1); +} + +/* we are going to pack 256 23-bit values, touching 23 256-bit words, using 368 + * bytes */ +static void avxpackblockmask23(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 23 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(8388607); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 23)); + w1 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), 5)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 19)); + w0 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 10)); + w1 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 
4, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), 1)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 11)); + w0 = _mm256_srli_epi32(tmp, 21); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)), + 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)), + 7)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 21)); + w0 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 3)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)), + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 13)); + w0 = _mm256_srli_epi32(tmp, 19); + 
_mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 21, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 9)); + _mm256_storeu_si256(compressed + 22, w0); +} + +/* we are going to pack 256 24-bit values, touching 24 256-bit words, using 384 + * bytes */ +static void avxpackblockmask24(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 24 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(16777215); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)), 8)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), 8)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)), + 8)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 8)); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + tmp = 
_mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 8)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 8)); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 8)); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 22, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 8)); + _mm256_storeu_si256(compressed + 23, w1); +} + +/* we are going to pack 256 25-bit values, touching 25 256-bit words, using 400 + * bytes */ +static void avxpackblockmask25(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 25 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(33554431); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 11)); + w1 = _mm256_srli_epi32(tmp, 21); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)), 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w1 = 
_mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 15)); + w0 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)), 1)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 19)); + w1 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)), + 5)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 9)); + w0 = _mm256_srli_epi32(tmp, 23); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)), + 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 13)); + w1 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)), + 6)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w1 = 
_mm256_or_si256(w1, _mm256_slli_epi32(tmp, 17)); + w0 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 10)); + w1 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)), + 3)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 23, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 7)); + _mm256_storeu_si256(compressed + 24, w0); +} + +/* we are going to pack 256 26-bit values, touching 26 256-bit words, using 416 + * bytes */ +static void avxpackblockmask26(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 26 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(67108863); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 14)); + w1 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)), 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 10)); + w0 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + 
_mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 6)); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 10)); + w1 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 24, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 6)); + _mm256_storeu_si256(compressed + 25, w1); 
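+ /* note: w0 and w1 alternate as the word under construction: the spill bits of a straddling value (the _mm256_srli_epi32 result) seed one register just before the finished word in the other is stored, and the leading _mm256_and_si256 with mask clears everything above bit 25 so inputs wider than 26 bits cannot corrupt their neighbours; since gcd(26, 32) = 2, the offset returns to zero halfway through, which is why in + 16 begins a fresh output word. */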
+} + +/* we are going to pack 256 27-bit values, touching 27 256-bit words, using 432 + * bytes */ +static void avxpackblockmask27(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 27 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(134217727); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 7)); + w1 = _mm256_srli_epi32(tmp, 25); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256( + w1, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)), 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 29)); + w0 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 19)); + w0 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 14)); + w1 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 9)); + w0 = _mm256_srli_epi32(tmp, 23); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)), + 4)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 31)); + w1 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 11)); + w1 = _mm256_srli_epi32(tmp, 21); + 
_mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 6)); + w0 = _mm256_srli_epi32(tmp, 26); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)), + 1)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 23)); + w0 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 13)); + w0 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)), + 3)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 10)); + w0 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 25, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 5)); + _mm256_storeu_si256(compressed + 26, w0); +} + +/* we are going to pack 256 28-bit values, touching 28 256-bit words, using 448 + * bytes */ +static void avxpackblockmask28(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 28 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(268435455); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + 
_mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256( + w0, + _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)), 4)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 4)); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)), + 4)); + _mm256_storeu_si256(compressed + 20, w0); + w1 = 
_mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 26, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 4)); + _mm256_storeu_si256(compressed + 27, w1); +} + +/* we are going to pack 256 29-bit values, touching 29 256-bit words, using 464 + * bytes */ +static void avxpackblockmask29(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 29 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(536870911); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 23)); + w1 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 11)); + w1 = _mm256_srli_epi32(tmp, 21); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 5)); + w1 = _mm256_srli_epi32(tmp, 27); + 
_mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)), + 2)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 31)); + w0 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 25)); + w0 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 19)); + w0 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 13)); + w0 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 10)); + w1 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 7)); + w0 = _mm256_srli_epi32(tmp, 25); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 4)); + w1 = _mm256_srli_epi32(tmp, 28); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)), + 1)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + 
_mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 9)); + w1 = _mm256_srli_epi32(tmp, 23); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 6)); + w0 = _mm256_srli_epi32(tmp, 26); + _mm256_storeu_si256(compressed + 27, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 3)); + _mm256_storeu_si256(compressed + 28, w0); +} + +/* we are going to pack 256 30-bit values, touching 30 256-bit words, using 480 + * bytes */ +static void avxpackblockmask30(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 30 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(1073741823); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 30)); + w1 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 26)); + w1 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 22)); + w1 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 18)); + w1 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 14)); + w1 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 10)); + w1 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 6)); + w1 = _mm256_srli_epi32(tmp, 26); + _mm256_storeu_si256(compressed + 
12, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 4)); + w0 = _mm256_srli_epi32(tmp, 28); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)), + 2)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 18)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 28)); + w1 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 24)); + w1 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 20)); + w1 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 16)); + w1 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 12)); + w1 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 10)); + w0 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 8)); + w1 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 6)); + w0 = _mm256_srli_epi32(tmp, 26); + _mm256_storeu_si256(compressed + 27, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 4)); + w1 = _mm256_srli_epi32(tmp, 28); + _mm256_storeu_si256(compressed + 28, w0); + w1 = _mm256_or_si256( + w1, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 2)); + _mm256_storeu_si256(compressed + 29, w1); +} + +/* we are going to pack 256 31-bit values, touching 31 256-bit words, using 496 + * bytes */ +static void avxpackblockmask31(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 31 256-bit words */ + __m256i w0, w1; + const 
__m256i *in = (const __m256i *)pin; + const __m256i mask = _mm256_set1_epi32(2147483647); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 0)); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 1)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 31)); + w1 = _mm256_srli_epi32(tmp, 1); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 2)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 30)); + w0 = _mm256_srli_epi32(tmp, 2); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 3)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 29)); + w1 = _mm256_srli_epi32(tmp, 3); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 4)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 28)); + w0 = _mm256_srli_epi32(tmp, 4); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 5)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 27)); + w1 = _mm256_srli_epi32(tmp, 5); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 6)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 26)); + w0 = _mm256_srli_epi32(tmp, 6); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 7)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 25)); + w1 = _mm256_srli_epi32(tmp, 7); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 8)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 24)); + w0 = _mm256_srli_epi32(tmp, 8); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 9)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 23)); + w1 = _mm256_srli_epi32(tmp, 9); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 10)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 22)); + w0 = _mm256_srli_epi32(tmp, 10); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 11)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 21)); + w1 = _mm256_srli_epi32(tmp, 11); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 12)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 20)); + w0 = _mm256_srli_epi32(tmp, 12); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 13)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 19)); + w1 = _mm256_srli_epi32(tmp, 13); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 14)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 18)); + w0 = _mm256_srli_epi32(tmp, 14); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 15)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 17)); + w1 = _mm256_srli_epi32(tmp, 15); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 16)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 16)); + w0 = _mm256_srli_epi32(tmp, 16); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 17)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 15)); + w1 = _mm256_srli_epi32(tmp, 17); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256(mask, 
_mm256_lddqu_si256(in + 18)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 14)); + w0 = _mm256_srli_epi32(tmp, 18); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 19)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 13)); + w1 = _mm256_srli_epi32(tmp, 19); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 20)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 12)); + w0 = _mm256_srli_epi32(tmp, 20); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 21)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 11)); + w1 = _mm256_srli_epi32(tmp, 21); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 22)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 10)); + w0 = _mm256_srli_epi32(tmp, 22); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 23)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 9)); + w1 = _mm256_srli_epi32(tmp, 23); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 24)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 8)); + w0 = _mm256_srli_epi32(tmp, 24); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 25)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 7)); + w1 = _mm256_srli_epi32(tmp, 25); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 26)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 6)); + w0 = _mm256_srli_epi32(tmp, 26); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 27)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 5)); + w1 = _mm256_srli_epi32(tmp, 27); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 28)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 4)); + w0 = _mm256_srli_epi32(tmp, 28); + _mm256_storeu_si256(compressed + 27, w1); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 29)); + w0 = _mm256_or_si256(w0, _mm256_slli_epi32(tmp, 3)); + w1 = _mm256_srli_epi32(tmp, 29); + _mm256_storeu_si256(compressed + 28, w0); + tmp = _mm256_and_si256(mask, _mm256_lddqu_si256(in + 30)); + w1 = _mm256_or_si256(w1, _mm256_slli_epi32(tmp, 2)); + w0 = _mm256_srli_epi32(tmp, 30); + _mm256_storeu_si256(compressed + 29, w1); + w0 = _mm256_or_si256( + w0, _mm256_slli_epi32(_mm256_and_si256(mask, _mm256_lddqu_si256(in + 31)), + 1)); + _mm256_storeu_si256(compressed + 30, w0); +} + +/* we are going to pack 256 32-bit values, touching 32 256-bit words, using 512 + * bytes */ +static void avxpackblockmask32(const uint32_t *pin, __m256i *compressed) { + /* we are going to touch 32 256-bit words */ + __m256i w0, w1; + const __m256i *in = (const __m256i *)pin; + w0 = _mm256_lddqu_si256(in + 0); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256(in + 1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_lddqu_si256(in + 2); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256(in + 3); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_lddqu_si256(in + 4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256(in + 5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256(in + 6); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256(in + 7); + 
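/* bit width 32: no packing is possible, so each 256-bit input word is simply copied to the output unchanged */ + 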
_mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_lddqu_si256(in + 8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256(in + 9); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_lddqu_si256(in + 10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_lddqu_si256(in + 11); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_lddqu_si256(in + 12); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_lddqu_si256(in + 13); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_lddqu_si256(in + 14); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256(in + 15); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_lddqu_si256(in + 16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_lddqu_si256(in + 17); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_lddqu_si256(in + 18); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_lddqu_si256(in + 19); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_lddqu_si256(in + 20); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_lddqu_si256(in + 21); + _mm256_storeu_si256(compressed + 21, w1); + w0 = _mm256_lddqu_si256(in + 22); + _mm256_storeu_si256(compressed + 22, w0); + w1 = _mm256_lddqu_si256(in + 23); + _mm256_storeu_si256(compressed + 23, w1); + w0 = _mm256_lddqu_si256(in + 24); + _mm256_storeu_si256(compressed + 24, w0); + w1 = _mm256_lddqu_si256(in + 25); + _mm256_storeu_si256(compressed + 25, w1); + w0 = _mm256_lddqu_si256(in + 26); + _mm256_storeu_si256(compressed + 26, w0); + w1 = _mm256_lddqu_si256(in + 27); + _mm256_storeu_si256(compressed + 27, w1); + w0 = _mm256_lddqu_si256(in + 28); + _mm256_storeu_si256(compressed + 28, w0); + w1 = _mm256_lddqu_si256(in + 29); + _mm256_storeu_si256(compressed + 29, w1); + w0 = _mm256_lddqu_si256(in + 30); + _mm256_storeu_si256(compressed + 30, w0); + w1 = _mm256_lddqu_si256(in + 31); + _mm256_storeu_si256(compressed + 31, w1); +} + +static void avxunpackblock0(const __m256i *compressed, uint32_t *pout) { + (void)compressed; + memset(pout, 0, 256); +} + +/* we packed 256 1-bit values, touching 1 256-bit words, using 16 bytes */ +static void avxunpackblock1(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 1 256-bit word */ + __m256i w0; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(1); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 1))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 3))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 5))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 7))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 9))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 10))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 11))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, 
_mm256_srli_epi32(w0, 13))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 14))); + _mm256_storeu_si256(out + 15, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 15))); + _mm256_storeu_si256(out + 16, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 17))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 18))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 19))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 20))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 21))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 22))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 23))); + _mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 24))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 25))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 26))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 27))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 28))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 29))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 30))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 31)); +} + +/* we packed 256 2-bit values, touching 2 256-bit words, using 32 bytes */ +static void avxunpackblock2(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 2 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(3); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 10))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 14))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 18))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 20))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 22))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 24))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 26))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 28))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w0, 30)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 
6))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 14))); + _mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 18))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 20))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 22))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 24))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 26))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 28))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 30)); +} + +/* we packed 256 3-bit values, touching 3 256-bit words, using 48 bytes */ +static void avxunpackblock3(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 3 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(7); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 3))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 9))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 15))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 18))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 21))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 24))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 27))); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 1))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 7))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + _mm256_storeu_si256(out + 15, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 13))); + _mm256_storeu_si256(out + 16, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 19))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 22))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 25))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 28))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 31), + _mm256_slli_epi32(w0, 1)))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 5))); + 
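/* the remaining 3-bit fields sit at bit offsets 8, 11, ..., 29 of this word; scalar equivalent per lane: out[j] = (word >> (3*j mod 32)) & 7, with the straddling values (j = 10 and j = 21) assembled from the low bits of the following word */ + 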
_mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 11))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 14))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 17))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 20))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 23))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 26))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 29)); +} + +/* we packed 256 4-bit values, touching 4 256-bit words, using 64 bytes */ +static void avxunpackblock4(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 4 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(15); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 20))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 24))); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32(w0, 28)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256(out + 8, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 20))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 24))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w1, 28)); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 20))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 24))); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32(w0, 28)); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256(out + 24, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 20))); + _mm256_storeu_si256(out + 30, + 
_mm256_and_si256(mask, _mm256_srli_epi32(w1, 24))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 28)); +} + +/* we packed 256 5-bit values, touching 5 256-bit words, using 80 bytes */ +static void avxunpackblock5(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 5 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(31); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 5))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 10))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 15))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 20))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 25))); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 3))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 13))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 18))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 23))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 1))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + _mm256_storeu_si256(out + 15, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 11))); + _mm256_storeu_si256(out + 16, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 21))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 26))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 31), + _mm256_slli_epi32(w1, 1)))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 9))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 14))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 19))); + _mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 24))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 29), + _mm256_slli_epi32(w0, 3)))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 7))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 17))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 22))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 27)); +} + +/* we packed 256 6-bit values, touching 6 256-bit 
words, using 96 bytes */ +static void avxunpackblock6(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 6 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(63); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 18))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 24))); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 22))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 14))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 20))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w0, 26)); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 6))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 18))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 24))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 10))); + _mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 22))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 14))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 20))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 26)); +} + +/* we packed 256 7-bit values, touching 7 256-bit words, using 112 bytes */ +static void avxunpackblock7(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 7 256-bit words */ + __m256i w0, w1; + __m256i 
*out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(127); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 7))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 14))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 21))); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 3))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 17))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 24))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 31), + _mm256_slli_epi32(w0, 1)))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 13))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 20))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 27), + _mm256_slli_epi32(w1, 5)))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + _mm256_storeu_si256(out + 15, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 9))); + _mm256_storeu_si256(out + 16, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 23))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 5))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 19))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 26), + _mm256_slli_epi32(w1, 6)))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 1))); + _mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 15))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 22))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 29), + _mm256_slli_epi32(w0, 3)))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 11))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 18))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 25)); +} + +/* we packed 256 8-bit values, touching 8 256-bit words, using 128 bytes */ +static void avxunpackblock8(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 8 256-bit words */ + __m256i w0, 
w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(255); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + _mm256_storeu_si256(out + 3, _mm256_srli_epi32(w0, 24)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256(out + 4, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32(w1, 24)); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256(out + 8, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + _mm256_storeu_si256(out + 11, _mm256_srli_epi32(w0, 24)); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256(out + 12, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w1, 24)); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + _mm256_storeu_si256(out + 19, _mm256_srli_epi32(w0, 24)); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256(out + 20, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32(w1, 24)); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256(out + 24, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + _mm256_storeu_si256(out + 27, _mm256_srli_epi32(w0, 24)); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256(out + 28, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 24)); +} + +/* we packed 256 9-bit values, touching 9 256-bit words, using 144 bytes */ +static void avxunpackblock9(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 9 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(511); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 9))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 18))); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 27), + _mm256_slli_epi32(w1, 5)))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, 
_mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 13))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 22))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 31), + _mm256_slli_epi32(w0, 1)))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 17))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 26), + _mm256_slli_epi32(w1, 6)))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 3))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 21))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + _mm256_storeu_si256(out + 15, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 7))); + _mm256_storeu_si256(out + 16, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 25), + _mm256_slli_epi32(w1, 7)))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 11))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 20))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 29), + _mm256_slli_epi32(w0, 3)))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 15))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 1))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 19))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 5))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 14))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 23)); +} + +/* we packed 256 10-bit values, touching 10 256-bit words, using 160 bytes */ +static void avxunpackblock10(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 10 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(1023); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 10))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 20))); + w1 = 
_mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 18))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 26), + _mm256_slli_epi32(w1, 6)))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 14))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w0, 22)); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 20))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 18))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 6))); + _mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 14))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 22)); +} + +/* we packed 256 11-bit values, touching 11 256-bit words, using 176 bytes */ +static void avxunpackblock11(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 11 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(2047); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, 
_mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 11))); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 1))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 23), + _mm256_slli_epi32(w0, 9)))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 13))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 3))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 14))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 25), + _mm256_slli_epi32(w0, 7)))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 15))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 26), + _mm256_slli_epi32(w1, 6)))); + _mm256_storeu_si256(out + 15, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 5))); + _mm256_storeu_si256(out + 16, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 27), + _mm256_slli_epi32(w0, 5)))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 17))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 7))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 18))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 29), + _mm256_slli_epi32(w0, 3)))); + _mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 19))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 9))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 20))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 31), + _mm256_slli_epi32(w0, 1)))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 10))); + _mm256_storeu_si256(out + 31, 
_mm256_srli_epi32(w0, 21)); +} + +/* we packed 256 12-bit values, touching 12 256-bit words, using 192 bytes */ +static void avxunpackblock12(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 12 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(4095); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32(w0, 20)); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256(out + 8, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w1, 20)); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32(w0, 20)); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256(out + 24, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, 
_mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 20)); +} + +/* we packed 256 13-bit values, touching 13 256-bit words, using 208 bytes */ +static void avxunpackblock13(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 13 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(8191); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 13))); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 26), + _mm256_slli_epi32(w1, 6)))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 7))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 1))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 14))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 27), + _mm256_slli_epi32(w1, 5)))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 21), + _mm256_slli_epi32(w0, 11)))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 15))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 9))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 22), + _mm256_slli_epi32(w0, 10)))); + _mm256_storeu_si256(out + 15, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 3))); + _mm256_storeu_si256(out + 16, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 29), + _mm256_slli_epi32(w1, 3)))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 23), + _mm256_slli_epi32(w0, 9)))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 17))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 11))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 
24), + _mm256_slli_epi32(w0, 8)))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 5))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 18))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 31), + _mm256_slli_epi32(w1, 1)))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 25), + _mm256_slli_epi32(w0, 7)))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 19)); +} + +/* we packed 256 14-bit values, touching 14 256-bit words, using 224 bytes */ +static void avxunpackblock14(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 14 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(16383); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 14))); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 26), + _mm256_slli_epi32(w1, 6)))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 22), + _mm256_slli_epi32(w0, 10)))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w0, 18)); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 14))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 10))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 20, + 
_mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 6))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + _mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 16))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 18)); +} + +/* we packed 256 15-bit values, touching 15 256-bit words, using 240 bytes */ +static void avxunpackblock15(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 15 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(32767); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 15))); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 13))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 11))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 26), + _mm256_slli_epi32(w1, 6)))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 9))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 7))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 5))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 3))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, 
_mm256_or_si256(_mm256_srli_epi32(w0, 18), + _mm256_slli_epi32(w1, 14)))); + _mm256_storeu_si256(out + 15, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 1))); + _mm256_storeu_si256(out + 16, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 16))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 31), + _mm256_slli_epi32(w0, 1)))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 14))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 29), + _mm256_slli_epi32(w1, 3)))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 27), + _mm256_slli_epi32(w0, 5)))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 10))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 25), + _mm256_slli_epi32(w1, 7)))); + _mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 23), + _mm256_slli_epi32(w0, 9)))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 21), + _mm256_slli_epi32(w1, 11)))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 19), + _mm256_slli_epi32(w0, 13)))); + _mm256_storeu_si256(out + 30, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 17)); +} + +/* we packed 256 16-bit values, touching 16 256-bit words, using 256 bytes */ +static void avxunpackblock16(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 16 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(65535); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 1, _mm256_srli_epi32(w0, 16)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256(out + 2, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 3, _mm256_srli_epi32(w1, 16)); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256(out + 4, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 5, _mm256_srli_epi32(w0, 16)); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256(out + 6, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32(w1, 16)); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256(out + 8, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 9, _mm256_srli_epi32(w0, 16)); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256(out + 10, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 11, _mm256_srli_epi32(w1, 16)); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256(out + 12, _mm256_and_si256(mask, w0)); + 
_mm256_storeu_si256(out + 13, _mm256_srli_epi32(w0, 16)); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256(out + 14, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w1, 16)); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 17, _mm256_srli_epi32(w0, 16)); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256(out + 18, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 19, _mm256_srli_epi32(w1, 16)); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256(out + 20, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 21, _mm256_srli_epi32(w0, 16)); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256(out + 22, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32(w1, 16)); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256(out + 24, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 25, _mm256_srli_epi32(w0, 16)); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256(out + 26, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 27, _mm256_srli_epi32(w1, 16)); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256(out + 28, _mm256_and_si256(mask, w0)); + _mm256_storeu_si256(out + 29, _mm256_srli_epi32(w0, 16)); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256(out + 30, _mm256_and_si256(mask, w1)); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 16)); +} + +/* we packed 256 17-bit values, touching 17 256-bit words, using 272 bytes */ +static void avxunpackblock17(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 17 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(131071); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 17), + _mm256_slli_epi32(w1, 15)))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 19), + _mm256_slli_epi32(w0, 13)))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 21), + _mm256_slli_epi32(w1, 11)))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 6))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 23), + _mm256_slli_epi32(w0, 9)))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 25), + _mm256_slli_epi32(w1, 7)))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 27), + _mm256_slli_epi32(w0, 5)))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + w1 = _mm256_lddqu_si256(compressed + 
7); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 29), + _mm256_slli_epi32(w1, 3)))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 14))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 15, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 31), + _mm256_slli_epi32(w0, 1)))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 16, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 1))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 18), + _mm256_slli_epi32(w0, 14)))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 3))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 5))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 22), + _mm256_slli_epi32(w0, 10)))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 7))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 9))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 11))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 13))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 15)); +} + +/* we packed 256 18-bit values, touching 18 256-bit words, using 288 bytes */ +static void avxunpackblock18(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 18 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(262143); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 18), + _mm256_slli_epi32(w1, 14)))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 22), + _mm256_slli_epi32(w0, 10)))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 
26), + _mm256_slli_epi32(w1, 6)))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 12))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w0, 14)); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 18), + _mm256_slli_epi32(w0, 14)))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 6))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 10))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 14)); +} + +/* we packed 256 19-bit values, touching 19 256-bit words, using 304 bytes */ +static void avxunpackblock19(const __m256i 
*compressed, uint32_t *pout) { + /* we are going to access 19 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(524287); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 19), + _mm256_slli_epi32(w1, 13)))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 6))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 25), + _mm256_slli_epi32(w0, 7)))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 12))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 31), + _mm256_slli_epi32(w1, 1)))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 18), + _mm256_slli_epi32(w0, 14)))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 5))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 11))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 17), + _mm256_slli_epi32(w1, 15)))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 23), + _mm256_slli_epi32(w0, 9)))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 10))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 15, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 29), + _mm256_slli_epi32(w1, 3)))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 16, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 3))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 9))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 15), + _mm256_slli_epi32(w1, 17)))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 21), + _mm256_slli_epi32(w0, 11)))); + _mm256_storeu_si256(out 
+ 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 27), + _mm256_slli_epi32(w1, 5)))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 14), + _mm256_slli_epi32(w0, 18)))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 1))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 7))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 13)); +} + +/* we packed 256 20-bit values, touching 20 256-bit words, using 320 bytes */ +static void avxunpackblock20(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 20 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(1048575); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32(w0, 12)); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256(out + 8, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w1, 12)); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 11); + 
_mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32(w0, 12)); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256(out + 24, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 12)); +} + +/* we packed 256 21-bit values, touching 21 256-bit words, using 336 bytes */ +static void avxunpackblock21(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 21 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(2097151); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 21), + _mm256_slli_epi32(w1, 11)))); + _mm256_storeu_si256(out + 2, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 10))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 31), + _mm256_slli_epi32(w0, 1)))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 9))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 19), + _mm256_slli_epi32(w1, 13)))); + _mm256_storeu_si256(out + 8, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + w0 = _mm256_lddqu_si256(compressed + 6); + 
_mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 29), + _mm256_slli_epi32(w0, 3)))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 18), + _mm256_slli_epi32(w1, 14)))); + _mm256_storeu_si256(out + 11, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 7))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 17), + _mm256_slli_epi32(w1, 15)))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 6))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 15, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 27), + _mm256_slli_epi32(w0, 5)))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 16, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 5))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 15), + _mm256_slli_epi32(w1, 17)))); + _mm256_storeu_si256(out + 20, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 25), + _mm256_slli_epi32(w0, 7)))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 14), + _mm256_slli_epi32(w1, 18)))); + _mm256_storeu_si256(out + 23, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 3))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 13), + _mm256_slli_epi32(w1, 19)))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 23), + _mm256_slli_epi32(w0, 9)))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 12), + _mm256_slli_epi32(w1, 20)))); + _mm256_storeu_si256(out + 29, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 1))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 22), + _mm256_slli_epi32(w0, 10)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 11)); +} + +/* we packed 256 22-bit values, touching 22 256-bit words, using 352 bytes */ +static void avxunpackblock22(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 22 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const 
__m256i mask = _mm256_set1_epi32(4194303); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 12), + _mm256_slli_epi32(w0, 20)))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 14), + _mm256_slli_epi32(w0, 18)))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 26), + _mm256_slli_epi32(w1, 6)))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 6))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 18), + _mm256_slli_epi32(w0, 14)))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 8))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w0, 10)); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 22), + _mm256_slli_epi32(w0, 10)))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 12), + _mm256_slli_epi32(w1, 20)))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 14), + _mm256_slli_epi32(w1, 18)))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 
24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 6))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 18), + _mm256_slli_epi32(w1, 14)))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 10)); +} + +/* we packed 256 23-bit values, touching 23 256-bit words, using 368 bytes */ +static void avxunpackblock23(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 23 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(8388607); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 23), + _mm256_slli_epi32(w1, 9)))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 14), + _mm256_slli_epi32(w0, 18)))); + _mm256_storeu_si256(out + 3, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 5))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 19), + _mm256_slli_epi32(w0, 13)))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 10), + _mm256_slli_epi32(w1, 22)))); + _mm256_storeu_si256(out + 7, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 1))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 15), + _mm256_slli_epi32(w1, 17)))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 6))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 29), + _mm256_slli_epi32(w0, 3)))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 11), + _mm256_slli_epi32(w0, 21)))); + _mm256_storeu_si256(out + 14, + _mm256_and_si256(mask, 
_mm256_srli_epi32(w0, 2))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 15, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 25), + _mm256_slli_epi32(w1, 7)))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 16, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + _mm256_storeu_si256(out + 17, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 7))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 21), + _mm256_slli_epi32(w0, 11)))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 12), + _mm256_slli_epi32(w1, 20)))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 3))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 17), + _mm256_slli_epi32(w1, 15)))); + _mm256_storeu_si256(out + 24, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 8))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 31), + _mm256_slli_epi32(w0, 1)))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 13), + _mm256_slli_epi32(w0, 19)))); + _mm256_storeu_si256(out + 28, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 27), + _mm256_slli_epi32(w1, 5)))); + w0 = _mm256_lddqu_si256(compressed + 22); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 18), + _mm256_slli_epi32(w0, 14)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 9)); +} + +/* we packed 256 24-bit values, touching 24 256-bit words, using 384 bytes */ +static void avxunpackblock24(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 24 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(16777215); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + _mm256_storeu_si256(out + 3, _mm256_srli_epi32(w0, 8)); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256(out + 4, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 4); + 
_mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32(w1, 8)); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256(out + 8, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + _mm256_storeu_si256(out + 11, _mm256_srli_epi32(w0, 8)); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256(out + 12, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w1, 8)); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + _mm256_storeu_si256(out + 19, _mm256_srli_epi32(w0, 8)); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256(out + 20, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32(w1, 8)); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256(out + 24, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + _mm256_storeu_si256(out + 27, _mm256_srli_epi32(w0, 8)); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256(out + 28, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 22); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 23); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 8)); +} + +/* we packed 256 25-bit values, touching 25 256-bit words, 
using 400 bytes */ +static void avxunpackblock25(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 25 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(33554431); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 25), + _mm256_slli_epi32(w1, 7)))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 18), + _mm256_slli_epi32(w0, 14)))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 11), + _mm256_slli_epi32(w1, 21)))); + _mm256_storeu_si256(out + 4, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 29), + _mm256_slli_epi32(w0, 3)))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 15), + _mm256_slli_epi32(w0, 17)))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 8), + _mm256_slli_epi32(w1, 24)))); + _mm256_storeu_si256(out + 9, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 1))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 19), + _mm256_slli_epi32(w1, 13)))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 12), + _mm256_slli_epi32(w0, 20)))); + _mm256_storeu_si256(out + 13, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 5))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 15, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 23), + _mm256_slli_epi32(w0, 9)))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 16, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 9), + _mm256_slli_epi32(w0, 23)))); + _mm256_storeu_si256(out + 18, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 27), + _mm256_slli_epi32(w1, 5)))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + w1 = 
_mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 13), + _mm256_slli_epi32(w1, 19)))); + _mm256_storeu_si256(out + 22, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 6))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 31), + _mm256_slli_epi32(w0, 1)))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 17), + _mm256_slli_epi32(w0, 15)))); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 10), + _mm256_slli_epi32(w1, 22)))); + _mm256_storeu_si256(out + 27, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 3))); + w0 = _mm256_lddqu_si256(compressed + 22); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 23); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 21), + _mm256_slli_epi32(w1, 11)))); + w0 = _mm256_lddqu_si256(compressed + 24); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 14), + _mm256_slli_epi32(w0, 18)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 7)); +} + +/* we packed 256 26-bit values, touching 26 256-bit words, using 416 bytes */ +static void avxunpackblock26(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 26 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(67108863); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 26), + _mm256_slli_epi32(w1, 6)))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 14), + _mm256_slli_epi32(w1, 18)))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 8), + _mm256_slli_epi32(w0, 24)))); + _mm256_storeu_si256(out + 5, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 2))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 22), + _mm256_slli_epi32(w0, 10)))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 10), + _mm256_slli_epi32(w0, 
22)))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 18), + _mm256_slli_epi32(w1, 14)))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 12), + _mm256_slli_epi32(w0, 20)))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w0, 6)); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 14), + _mm256_slli_epi32(w0, 18)))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 8), + _mm256_slli_epi32(w1, 24)))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 10), + _mm256_slli_epi32(w1, 22)))); + _mm256_storeu_si256(out + 26, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 4))); + w0 = _mm256_lddqu_si256(compressed + 22); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + w1 = _mm256_lddqu_si256(compressed + 23); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 24); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 18), + _mm256_slli_epi32(w0, 14)))); + w1 = _mm256_lddqu_si256(compressed + 25); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 12), + _mm256_slli_epi32(w1, 20)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 6)); +} + +/* we packed 256 27-bit values, touching 27 256-bit words, using 432 bytes */ +static void avxunpackblock27(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 27 256-bit words */ + 
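/* note on the layout: each 32-bit lane carries its own packed stream, so the j-th output of a lane occupies bits 27*j .. 27*j+26 of that lane; a value straddling two words is rebuilt by OR-ing a right shift of the low word with a left shift of the high word and masking to 27 bits, e.g. the out + 1 store below computes ((w0 >> 27) | (w1 << 5)) & mask */ + 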
__m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(134217727); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 27), + _mm256_slli_epi32(w1, 5)))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 22), + _mm256_slli_epi32(w0, 10)))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 17), + _mm256_slli_epi32(w1, 15)))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 12), + _mm256_slli_epi32(w0, 20)))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 7), + _mm256_slli_epi32(w1, 25)))); + _mm256_storeu_si256(out + 6, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 29), + _mm256_slli_epi32(w0, 3)))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 19), + _mm256_slli_epi32(w0, 13)))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 14), + _mm256_slli_epi32(w1, 18)))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 9), + _mm256_slli_epi32(w0, 23)))); + _mm256_storeu_si256(out + 12, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 4))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 31), + _mm256_slli_epi32(w1, 1)))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 15, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 21), + _mm256_slli_epi32(w1, 11)))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 16, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 11), + _mm256_slli_epi32(w1, 21)))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 6), + _mm256_slli_epi32(w0, 26)))); + _mm256_storeu_si256(out + 19, + _mm256_and_si256(mask, _mm256_srli_epi32(w0, 1))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 21, + 
_mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 23), + _mm256_slli_epi32(w0, 9)))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 18), + _mm256_slli_epi32(w1, 14)))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 13), + _mm256_slli_epi32(w0, 19)))); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 8), + _mm256_slli_epi32(w1, 24)))); + _mm256_storeu_si256(out + 25, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 3))); + w0 = _mm256_lddqu_si256(compressed + 22); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + w1 = _mm256_lddqu_si256(compressed + 23); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 25), + _mm256_slli_epi32(w1, 7)))); + w0 = _mm256_lddqu_si256(compressed + 24); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + w1 = _mm256_lddqu_si256(compressed + 25); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 15), + _mm256_slli_epi32(w1, 17)))); + w0 = _mm256_lddqu_si256(compressed + 26); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 10), + _mm256_slli_epi32(w0, 22)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 5)); +} + +/* we packed 256 28-bit values, touching 28 256-bit words, using 448 bytes */ +static void avxunpackblock28(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 28 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(268435455); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 12), + _mm256_slli_epi32(w1, 20)))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 8), + _mm256_slli_epi32(w0, 24)))); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32(w0, 4)); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256(out + 8, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 10, 
+ _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 12), + _mm256_slli_epi32(w0, 20)))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 8), + _mm256_slli_epi32(w1, 24)))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w1, 4)); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 12), + _mm256_slli_epi32(w1, 20)))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 8), + _mm256_slli_epi32(w0, 24)))); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32(w0, 4)); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256(out + 24, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 22); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 23); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 24); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + w1 = _mm256_lddqu_si256(compressed + 25); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + w0 = _mm256_lddqu_si256(compressed + 26); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 12), + _mm256_slli_epi32(w0, 20)))); + w1 = _mm256_lddqu_si256(compressed + 27); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 8), + _mm256_slli_epi32(w1, 24)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 4)); +} + +/* we packed 256 29-bit values, touching 29 256-bit words, using 464 bytes */ +static void avxunpackblock29(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 29 256-bit words */ + 
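/* outputs that straddle two 256-bit input words are rebuilt by OR-ing the right-shifted current word with the left-shifted next word, then masking to 29 bits; w0 and w1 alternate as the routine walks the compressed stream */ +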
__m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(536870911); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 29), + _mm256_slli_epi32(w1, 3)))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 23), + _mm256_slli_epi32(w1, 9)))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 17), + _mm256_slli_epi32(w1, 15)))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 14), + _mm256_slli_epi32(w0, 18)))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 11), + _mm256_slli_epi32(w1, 21)))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 8), + _mm256_slli_epi32(w0, 24)))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 5), + _mm256_slli_epi32(w1, 27)))); + _mm256_storeu_si256(out + 10, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 2))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 31), + _mm256_slli_epi32(w0, 1)))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 25), + _mm256_slli_epi32(w0, 7)))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 15, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 19), + _mm256_slli_epi32(w0, 13)))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 16, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 13), + _mm256_slli_epi32(w0, 19)))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 10), + _mm256_slli_epi32(w1, 22)))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 7), + _mm256_slli_epi32(w0, 25)))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out 
+ 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 4), + _mm256_slli_epi32(w1, 28)))); + _mm256_storeu_si256(out + 21, + _mm256_and_si256(mask, _mm256_srli_epi32(w1, 1))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 27), + _mm256_slli_epi32(w1, 5)))); + w0 = _mm256_lddqu_si256(compressed + 22); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 23); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 21), + _mm256_slli_epi32(w1, 11)))); + w0 = _mm256_lddqu_si256(compressed + 24); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 18), + _mm256_slli_epi32(w0, 14)))); + w1 = _mm256_lddqu_si256(compressed + 25); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 15), + _mm256_slli_epi32(w1, 17)))); + w0 = _mm256_lddqu_si256(compressed + 26); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 12), + _mm256_slli_epi32(w0, 20)))); + w1 = _mm256_lddqu_si256(compressed + 27); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 9), + _mm256_slli_epi32(w1, 23)))); + w0 = _mm256_lddqu_si256(compressed + 28); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 6), + _mm256_slli_epi32(w0, 26)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 3)); +} + +/* we packed 256 30-bit values, touching 30 256-bit words, using 480 bytes */ +static void avxunpackblock30(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 30 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(1073741823); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 30), + _mm256_slli_epi32(w1, 2)))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 26), + _mm256_slli_epi32(w1, 6)))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 22), + _mm256_slli_epi32(w1, 10)))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 18), + _mm256_slli_epi32(w1, 14)))); + w0 = _mm256_lddqu_si256(compressed + 8); + 
_mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 14), + _mm256_slli_epi32(w1, 18)))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 12), + _mm256_slli_epi32(w0, 20)))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 10), + _mm256_slli_epi32(w1, 22)))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 8), + _mm256_slli_epi32(w0, 24)))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 6), + _mm256_slli_epi32(w1, 26)))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 4), + _mm256_slli_epi32(w0, 28)))); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32(w0, 2)); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256(out + 16, _mm256_and_si256(mask, w1)); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 28), + _mm256_slli_epi32(w1, 4)))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 24), + _mm256_slli_epi32(w1, 8)))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 22), + _mm256_slli_epi32(w0, 10)))); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 20), + _mm256_slli_epi32(w1, 12)))); + w0 = _mm256_lddqu_si256(compressed + 22); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 18), + _mm256_slli_epi32(w0, 14)))); + w1 = _mm256_lddqu_si256(compressed + 23); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 16), + _mm256_slli_epi32(w1, 16)))); + w0 = _mm256_lddqu_si256(compressed + 24); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 14), + _mm256_slli_epi32(w0, 18)))); + w1 = _mm256_lddqu_si256(compressed + 25); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 12), + _mm256_slli_epi32(w1, 20)))); + w0 = _mm256_lddqu_si256(compressed + 26); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 10), + _mm256_slli_epi32(w0, 22)))); + w1 = _mm256_lddqu_si256(compressed + 27); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 8), + _mm256_slli_epi32(w1, 24)))); + w0 = _mm256_lddqu_si256(compressed + 28); + 
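/* as throughout these unpackers, the two shift counts in each srli/slli pair sum to 32: the low bits of a straddling value come from the top of one input word and its high bits from the bottom of the next */ +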
_mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 6), + _mm256_slli_epi32(w0, 26)))); + w1 = _mm256_lddqu_si256(compressed + 29); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 4), + _mm256_slli_epi32(w1, 28)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w1, 2)); +} + +/* we packed 256 31-bit values, touching 31 256-bit words, using 496 bytes */ +static void avxunpackblock31(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 31 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + const __m256i mask = _mm256_set1_epi32(2147483647); + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256(mask, w0)); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256( + out + 1, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 31), + _mm256_slli_epi32(w1, 1)))); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256( + out + 2, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 30), + _mm256_slli_epi32(w0, 2)))); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256( + out + 3, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 29), + _mm256_slli_epi32(w1, 3)))); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256( + out + 4, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 28), + _mm256_slli_epi32(w0, 4)))); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256( + out + 5, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 27), + _mm256_slli_epi32(w1, 5)))); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256( + out + 6, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 26), + _mm256_slli_epi32(w0, 6)))); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256( + out + 7, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 25), + _mm256_slli_epi32(w1, 7)))); + w0 = _mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256( + out + 8, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 24), + _mm256_slli_epi32(w0, 8)))); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256( + out + 9, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 23), + _mm256_slli_epi32(w1, 9)))); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256( + out + 10, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 22), + _mm256_slli_epi32(w0, 10)))); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256( + out + 11, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 21), + _mm256_slli_epi32(w1, 11)))); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256( + out + 12, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 20), + _mm256_slli_epi32(w0, 12)))); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256( + out + 13, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 19), + _mm256_slli_epi32(w1, 13)))); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256( + out + 14, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 18), + _mm256_slli_epi32(w0, 14)))); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256( + out + 15, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 17), + _mm256_slli_epi32(w1, 15)))); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256( + out + 16, + _mm256_and_si256(mask, 
_mm256_or_si256(_mm256_srli_epi32(w1, 16), + _mm256_slli_epi32(w0, 16)))); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256( + out + 17, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 15), + _mm256_slli_epi32(w1, 17)))); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256( + out + 18, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 14), + _mm256_slli_epi32(w0, 18)))); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256( + out + 19, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 13), + _mm256_slli_epi32(w1, 19)))); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256( + out + 20, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 12), + _mm256_slli_epi32(w0, 20)))); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256( + out + 21, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 11), + _mm256_slli_epi32(w1, 21)))); + w0 = _mm256_lddqu_si256(compressed + 22); + _mm256_storeu_si256( + out + 22, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 10), + _mm256_slli_epi32(w0, 22)))); + w1 = _mm256_lddqu_si256(compressed + 23); + _mm256_storeu_si256( + out + 23, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 9), + _mm256_slli_epi32(w1, 23)))); + w0 = _mm256_lddqu_si256(compressed + 24); + _mm256_storeu_si256( + out + 24, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 8), + _mm256_slli_epi32(w0, 24)))); + w1 = _mm256_lddqu_si256(compressed + 25); + _mm256_storeu_si256( + out + 25, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 7), + _mm256_slli_epi32(w1, 25)))); + w0 = _mm256_lddqu_si256(compressed + 26); + _mm256_storeu_si256( + out + 26, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 6), + _mm256_slli_epi32(w0, 26)))); + w1 = _mm256_lddqu_si256(compressed + 27); + _mm256_storeu_si256( + out + 27, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 5), + _mm256_slli_epi32(w1, 27)))); + w0 = _mm256_lddqu_si256(compressed + 28); + _mm256_storeu_si256( + out + 28, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 4), + _mm256_slli_epi32(w0, 28)))); + w1 = _mm256_lddqu_si256(compressed + 29); + _mm256_storeu_si256( + out + 29, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w0, 3), + _mm256_slli_epi32(w1, 29)))); + w0 = _mm256_lddqu_si256(compressed + 30); + _mm256_storeu_si256( + out + 30, + _mm256_and_si256(mask, _mm256_or_si256(_mm256_srli_epi32(w1, 2), + _mm256_slli_epi32(w0, 30)))); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32(w0, 1)); +} + +/* we packed 256 32-bit values, touching 32 256-bit words, using 512 bytes */ +static void avxunpackblock32(const __m256i *compressed, uint32_t *pout) { + /* we are going to access 32 256-bit words */ + __m256i w0, w1; + __m256i *out = (__m256i *)pout; + w0 = _mm256_lddqu_si256(compressed); + _mm256_storeu_si256(out + 0, w0); + w1 = _mm256_lddqu_si256(compressed + 1); + _mm256_storeu_si256(out + 1, w1); + w0 = _mm256_lddqu_si256(compressed + 2); + _mm256_storeu_si256(out + 2, w0); + w1 = _mm256_lddqu_si256(compressed + 3); + _mm256_storeu_si256(out + 3, w1); + w0 = _mm256_lddqu_si256(compressed + 4); + _mm256_storeu_si256(out + 4, w0); + w1 = _mm256_lddqu_si256(compressed + 5); + _mm256_storeu_si256(out + 5, w1); + w0 = _mm256_lddqu_si256(compressed + 6); + _mm256_storeu_si256(out + 6, w0); + w1 = _mm256_lddqu_si256(compressed + 7); + _mm256_storeu_si256(out + 7, w1); + w0 = 
_mm256_lddqu_si256(compressed + 8); + _mm256_storeu_si256(out + 8, w0); + w1 = _mm256_lddqu_si256(compressed + 9); + _mm256_storeu_si256(out + 9, w1); + w0 = _mm256_lddqu_si256(compressed + 10); + _mm256_storeu_si256(out + 10, w0); + w1 = _mm256_lddqu_si256(compressed + 11); + _mm256_storeu_si256(out + 11, w1); + w0 = _mm256_lddqu_si256(compressed + 12); + _mm256_storeu_si256(out + 12, w0); + w1 = _mm256_lddqu_si256(compressed + 13); + _mm256_storeu_si256(out + 13, w1); + w0 = _mm256_lddqu_si256(compressed + 14); + _mm256_storeu_si256(out + 14, w0); + w1 = _mm256_lddqu_si256(compressed + 15); + _mm256_storeu_si256(out + 15, w1); + w0 = _mm256_lddqu_si256(compressed + 16); + _mm256_storeu_si256(out + 16, w0); + w1 = _mm256_lddqu_si256(compressed + 17); + _mm256_storeu_si256(out + 17, w1); + w0 = _mm256_lddqu_si256(compressed + 18); + _mm256_storeu_si256(out + 18, w0); + w1 = _mm256_lddqu_si256(compressed + 19); + _mm256_storeu_si256(out + 19, w1); + w0 = _mm256_lddqu_si256(compressed + 20); + _mm256_storeu_si256(out + 20, w0); + w1 = _mm256_lddqu_si256(compressed + 21); + _mm256_storeu_si256(out + 21, w1); + w0 = _mm256_lddqu_si256(compressed + 22); + _mm256_storeu_si256(out + 22, w0); + w1 = _mm256_lddqu_si256(compressed + 23); + _mm256_storeu_si256(out + 23, w1); + w0 = _mm256_lddqu_si256(compressed + 24); + _mm256_storeu_si256(out + 24, w0); + w1 = _mm256_lddqu_si256(compressed + 25); + _mm256_storeu_si256(out + 25, w1); + w0 = _mm256_lddqu_si256(compressed + 26); + _mm256_storeu_si256(out + 26, w0); + w1 = _mm256_lddqu_si256(compressed + 27); + _mm256_storeu_si256(out + 27, w1); + w0 = _mm256_lddqu_si256(compressed + 28); + _mm256_storeu_si256(out + 28, w0); + w1 = _mm256_lddqu_si256(compressed + 29); + _mm256_storeu_si256(out + 29, w1); + w0 = _mm256_lddqu_si256(compressed + 30); + _mm256_storeu_si256(out + 30, w0); + w1 = _mm256_lddqu_si256(compressed + 31); + _mm256_storeu_si256(out + 31, w1); +} + +static avxpackblockfnc avxfuncPackArr[] = { + &avxpackblock0, &avxpackblock1, &avxpackblock2, &avxpackblock3, + &avxpackblock4, &avxpackblock5, &avxpackblock6, &avxpackblock7, + &avxpackblock8, &avxpackblock9, &avxpackblock10, &avxpackblock11, + &avxpackblock12, &avxpackblock13, &avxpackblock14, &avxpackblock15, + &avxpackblock16, &avxpackblock17, &avxpackblock18, &avxpackblock19, + &avxpackblock20, &avxpackblock21, &avxpackblock22, &avxpackblock23, + &avxpackblock24, &avxpackblock25, &avxpackblock26, &avxpackblock27, + &avxpackblock28, &avxpackblock29, &avxpackblock30, &avxpackblock31, + &avxpackblock32}; +static avxpackblockfnc avxfuncPackMaskArr[] = { + &avxpackblockmask0, &avxpackblockmask1, &avxpackblockmask2, + &avxpackblockmask3, &avxpackblockmask4, &avxpackblockmask5, + &avxpackblockmask6, &avxpackblockmask7, &avxpackblockmask8, + &avxpackblockmask9, &avxpackblockmask10, &avxpackblockmask11, + &avxpackblockmask12, &avxpackblockmask13, &avxpackblockmask14, + &avxpackblockmask15, &avxpackblockmask16, &avxpackblockmask17, + &avxpackblockmask18, &avxpackblockmask19, &avxpackblockmask20, + &avxpackblockmask21, &avxpackblockmask22, &avxpackblockmask23, + &avxpackblockmask24, &avxpackblockmask25, &avxpackblockmask26, + &avxpackblockmask27, &avxpackblockmask28, &avxpackblockmask29, + &avxpackblockmask30, &avxpackblockmask31, &avxpackblockmask32}; +static avxunpackblockfnc avxfuncUnpackArr[] = { + &avxunpackblock0, &avxunpackblock1, &avxunpackblock2, &avxunpackblock3, + &avxunpackblock4, &avxunpackblock5, &avxunpackblock6, &avxunpackblock7, + &avxunpackblock8, &avxunpackblock9, 
&avxunpackblock10, &avxunpackblock11, + &avxunpackblock12, &avxunpackblock13, &avxunpackblock14, &avxunpackblock15, + &avxunpackblock16, &avxunpackblock17, &avxunpackblock18, &avxunpackblock19, + &avxunpackblock20, &avxunpackblock21, &avxunpackblock22, &avxunpackblock23, + &avxunpackblock24, &avxunpackblock25, &avxunpackblock26, &avxunpackblock27, + &avxunpackblock28, &avxunpackblock29, &avxunpackblock30, &avxunpackblock31, + &avxunpackblock32}; +/** avxpacking **/ + +/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */ +void avxpack(const uint32_t *in, __m256i *out, const uint32_t bit) { + avxfuncPackMaskArr[bit](in, out); +} + +/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */ +void avxpackwithoutmask(const uint32_t *in, __m256i *out, const uint32_t bit) { + avxfuncPackArr[bit](in, out); +} + +/* reads "bit" 256-bit vectors from "in", writes 256 values to "out" */ +void avxunpack(const __m256i *in, uint32_t *out, const uint32_t bit) { + avxfuncUnpackArr[bit](in, out); +} + +#endif /* __AVX2__ */ diff --git a/src/simdbitpacking.c b/src/simdbitpacking.c index 9c90e72..717abde 100644 --- a/src/simdbitpacking.c +++ b/src/simdbitpacking.c @@ -3,14005 +3,14397 @@ */ #include "simdbitpacking.h" - -static void SIMD_nullunpacker32(const __m128i * _in , uint32_t * out) { - memset(out,0,32 * 4 * 4); +static void SIMD_nullunpacker32(const __m128i *_in, uint32_t *out) { + (void)_in; + memset(out, 0, 32 * 4 * 4); } -static void __SIMD_fastpackwithoutmask1_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); +static void __SIMD_fastpackwithoutmask1_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
28)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); +} +static void __SIMD_fastpackwithoutmask2_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastpackwithoutmask2_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask3_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastpackwithoutmask3_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; 
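+ /* the previous 3-bit value was written at bit offset 31, so only its low bit fit in the last word; the remaining two high bits are recovered here with a right shift and start the new output word */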
+ OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask5_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + 
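/* each 32-bit lane of OutReg is now full: the 5-bit values written at offset 30 only had room for two bits, so after advancing, their three high bits carry into the next output vector */ +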
++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastpackwithoutmask5_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask6_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastpackwithoutmask6_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); +} - OutReg = InReg; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask7_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastpackwithoutmask7_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg 
= _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask9_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastpackwithoutmask9_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask10_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastpackwithoutmask10_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask11_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); } +static void __SIMD_fastpackwithoutmask12_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastpackwithoutmask11_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = 
_mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask13_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); } +static void __SIMD_fastpackwithoutmask14_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); +} +static void __SIMD_fastpackwithoutmask15_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); +} -static void __SIMD_fastpackwithoutmask12_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); +static void __SIMD_fastpackwithoutmask17_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); +} - OutReg = InReg; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask18_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask19_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask20_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask21_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); 
+ InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask22_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_loadu_si128(++in); +static void 
__SIMD_fastpackwithoutmask23_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask24_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask25_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); +} - OutReg = InReg; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask26_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = 
_mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask27_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask28_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask29_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 
20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask30_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_loadu_si128(++in); +static void 
__SIMD_fastpackwithoutmask31_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask32_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = 
_mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpackwithoutmask4_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + for (outer = 0; outer < 4; ++outer) { + InReg = _mm_loadu_si128(in); OutReg = InReg; - InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + InReg = _mm_loadu_si128(in + 1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_loadu_si128(++in); + InReg = _mm_loadu_si128(in + 2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + InReg = _mm_loadu_si128(in + 3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(in + 4); OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + InReg = _mm_loadu_si128(in + 5); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_loadu_si128(in + 6); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_loadu_si128(in + 7); OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); _mm_storeu_si128(out, OutReg); ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - 
InReg = _mm_loadu_si128(++in);
+ in += 8;
+ }
+}
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
- _mm_storeu_si128(out, OutReg);
- ++out;
- InReg = _mm_loadu_si128(++in);
+static void __SIMD_fastpackwithoutmask8_32(const uint32_t *_in, __m128i *out) {
+ const __m128i *in = (const __m128i *)(_in);
+ __m128i OutReg;
+ __m128i InReg;
+ uint32_t outer;
+ for (outer = 0; outer < 8; ++outer) {
+ InReg = _mm_loadu_si128(in);
OutReg = InReg;
- InReg = _mm_loadu_si128(++in);
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
- InReg = _mm_loadu_si128(++in);
+ InReg = _mm_loadu_si128(in + 1);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+
+ InReg = _mm_loadu_si128(in + 2);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(in + 3);
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
_mm_storeu_si128(out, OutReg);
++out;
- OutReg = _mm_srli_epi32(InReg, 12 - 4);
- InReg = _mm_loadu_si128(++in);
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
- InReg = _mm_loadu_si128(++in);
+ in += 4;
+ }
+}
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
- InReg = _mm_loadu_si128(++in);
+static void __SIMD_fastpackwithoutmask16_32(const uint32_t *_in, __m128i *out) {
+ const __m128i *in = (const __m128i *)(_in);
+ __m128i OutReg;
+ __m128i InReg;
+ uint32_t outer;
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ for (outer = 0; outer < 16; ++outer) {
+ InReg = _mm_loadu_si128(in);
+ OutReg = InReg;
+
+ InReg = _mm_loadu_si128(in + 1);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
_mm_storeu_si128(out, OutReg);
++out;
- OutReg = _mm_srli_epi32(InReg, 12 - 8);
- InReg = _mm_loadu_si128(++in);
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
- InReg = _mm_loadu_si128(++in);
+ in += 2;
+ }
+}
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
- _mm_storeu_si128(out, OutReg);
+static void __SIMD_fastpack1_32(const uint32_t *_in, __m128i *out) {
+ const __m128i *in = (const __m128i *)(_in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U << 1) - 1);
-}
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
-static void __SIMD_fastpackwithoutmask13_32(const uint32_t * _in, __m128i * out) {
- const __m128i *in = (const __m128i*)(_in);
- __m128i OutReg;
- __m128i InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
- OutReg = InReg;
- InReg = _mm_loadu_si128(++in);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
- InReg = _mm_loadu_si128(++in);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
- _mm_storeu_si128(out, OutReg);
- ++out;
- OutReg = _mm_srli_epi32(InReg, 13 - 7);
- InReg = _mm_loadu_si128(++in);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
- InReg = _mm_loadu_si128(++in);
+ OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpack2_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - InReg = _mm_loadu_si128(++in); + const __m128i mask = _mm_set1_epi32((1U << 2) - 1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -static void __SIMD_fastpackwithoutmask14_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpack3_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + const __m128i mask = _mm_set1_epi32((1U << 3) - 1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = _mm_loadu_si128(++in); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -static void __SIMD_fastpackwithoutmask15_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); 
+ _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpack5_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - InReg = _mm_loadu_si128(++in); + const __m128i mask = _mm_set1_epi32((1U << 5) - 1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -static void __SIMD_fastpackwithoutmask17_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpack6_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - InReg = _mm_loadu_si128(++in); + const __m128i mask = _mm_set1_epi32((1U << 6) - 1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - InReg = _mm_loadu_si128(++in); 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -static void __SIMD_fastpackwithoutmask18_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); +} - OutReg = InReg; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpack7_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_loadu_si128(++in); + const __m128i mask = _mm_set1_epi32((1U << 7) - 1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_loadu_si128(++in); + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask19_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 19 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask20_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask21_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask22_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_loadu_si128(++in); - - 
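/*
 * Carry pattern used throughout these generated packers: each 32-bit lane of
 * the __m128i words packs its own strided stream of b-bit values.  Whenever a
 * value straddles a 128-bit output word, its low (b - r) bits are emitted with
 * the current word; the next word then starts as _mm_srli_epi32(InReg, b - r),
 * which places the remaining r high bits at positions 0..r-1, and the
 * following input is OR-ed in at offset r via _mm_slli_epi32(InReg, r).
 */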
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask23_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask24_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, 
OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask25_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask26_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, 
OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask27_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask28_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = 
_mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask29_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - 
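/*
 * For reference, a minimal scalar sketch of what each 32-bit lane above
 * computes (an illustrative aid with an invented name, not part of this
 * patch): 32 values of b bits each are packed into b consecutive 32-bit
 * words, carrying the high bits of any value that straddles a word boundary.
 * Like the "withoutmask" routines, it assumes every input already fits in b
 * bits.
 */
static void scalar_pack_without_mask(const uint32_t *in, uint32_t *out, int b) {
    int k;
    int offset = 0;        /* bit offset inside the word being assembled */
    uint32_t current = 0;  /* output word being assembled                */
    for (k = 0; k < 32; ++k) {
        current |= in[k] << offset;
        if (offset + b >= 32) { /* value fills or straddles this word */
            *out++ = current;
            /* carry the bits that did not fit (none when it fit exactly) */
            current = (offset + b == 32) ? 0 : (in[k] >> (32 - offset));
        }
        offset = (offset + b) & 31;
    }
}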
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask30_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask31_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 31 - 29); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - 
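/*
 * With b = 31, every value after the first straddles an output word, so each
 * stored word combines the carried high bits of one value with the next value
 * shifted into place; the shift amounts (31 - 30, 31 - 29, ...) step down by
 * one per word.
 */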
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask32_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = 
InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask4_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg; - - for(uint32_t outer=0; outer< 4 ;++outer) { - InReg = _mm_loadu_si128(in); - OutReg = InReg; - - InReg = _mm_loadu_si128(in+1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - - InReg = _mm_loadu_si128(in+2); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - - InReg = _mm_loadu_si128(in+3); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - - InReg = _mm_loadu_si128(in+4); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - - InReg = _mm_loadu_si128(in+5); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - - InReg = _mm_loadu_si128(in+6); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - - InReg = _mm_loadu_si128(in+7); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - - in+=8; - } - -} - - - -static void __SIMD_fastpackwithoutmask8_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg; - - for(uint32_t outer=0; outer< 8 ;++outer) { - InReg = _mm_loadu_si128(in); - OutReg = InReg; - - InReg = _mm_loadu_si128(in+1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - - InReg = _mm_loadu_si128(in+2); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - - InReg = _mm_loadu_si128(in+3); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - - in+=4; - } - -} - - - -static void __SIMD_fastpackwithoutmask16_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg; - - for(uint32_t outer=0; outer< 16 ;++outer) { - InReg = _mm_loadu_si128(in); - OutReg = InReg; - - InReg = _mm_loadu_si128(in+1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - - in+=2; - } - -} - - - -static void __SIMD_fastpack1_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<1)-1); - - __m128i InReg = 
_mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); 
- - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack2_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<2)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack3_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<3)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack5_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<5)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack6_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<6)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack7_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<7)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack9_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<9)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); 
- InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack10_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<10)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack11_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<11)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack12_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<12)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack13_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<13)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack14_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<14)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack15_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<15)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack17_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<17)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 
16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack18_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<18)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack19_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<19)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack20_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<20)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); 
- InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack21_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<21)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack22_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<22)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); 
- - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack23_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<23)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg 
= _mm_srli_epi32(InReg, 23 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack24_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<24)-1); - - 
__m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack25_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<25)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - 
OutReg = _mm_srli_epi32(InReg, 25 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack26_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<26)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack27_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<27)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack28_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<28)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 
24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack29_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<29)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack30_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<30)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 
24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack31_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<31)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack32_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - __m128i InReg = _mm_loadu_si128(in); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = 
InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpack9_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; - 
OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + const __m128i mask = _mm_set1_epi32((1U << 9) - 1); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -static void __SIMD_fastpack4_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U<<4)-1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - for(uint32_t outer=0; outer< 4 ;++outer) { - InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in+4), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in+5), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = 
_mm_and_si128(_mm_loadu_si128(in+6), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in+7), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - in+=8; - } + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -static void __SIMD_fastpack8_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U<<8)-1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - for(uint32_t outer=0; outer< 8 ;++outer) { - InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - in+=4; - } + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); } +static void __SIMD_fastpack10_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 10) - 1); -static void __SIMD_fastpack16_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U<<16)-1); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), 
mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - for(uint32_t outer=0; outer< 16 ;++outer) { - InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - in+=2; - } + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -static void __SIMD_fastunpack1_32(const __m128i* in, uint32_t * _out) { - __m128i* out = (__m128i*)(_out); - __m128i InReg1 = _mm_loadu_si128(in); - __m128i InReg2 = InReg1; - __m128i OutReg1, OutReg2, OutReg3, OutReg4; - const __m128i mask = _mm_set1_epi32(1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - unsigned shift = 0; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - for (unsigned i = 0; i < 8; ++i) { - OutReg1 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); - OutReg2 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); - OutReg3 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); - OutReg4 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); - _mm_storeu_si128(out++, OutReg1); - _mm_storeu_si128(out++, OutReg2); - _mm_storeu_si128(out++, OutReg3); - _mm_storeu_si128(out++, OutReg4); - } -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -static void __SIMD_fastunpack2_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<2)-1); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpack11_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i 
OutReg; - _mm_storeu_si128(out++, OutReg); + const __m128i mask = _mm_set1_epi32((1U << 11) - 1); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg,30) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -static void __SIMD_fastunpack3_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<3)-1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); +} 
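(For orientation: each __SIMD_fastpackB_32 kernel in this hunk packs 128 integers, read as 32 SSE vectors of four 32-bit lanes, into B output vectors; the _mm_srli_epi32(InReg, B - k) steps carry the high bits of a value that straddles a 32-bit word boundary into the next output word. The snippet below is only a rough per-lane scalar sketch of that layout, assuming 1 <= B <= 32; scalar_pack_lane is a hypothetical helper and is not part of this patch.)

/* Rough scalar sketch (hypothetical, not in this patch): pack one SSE lane's
   worth of data -- 32 values of at most B bits each -- into B consecutive
   32-bit words, mirroring the carry logic of the kernels above. */
static void scalar_pack_lane(const uint32_t *in, uint32_t *out, unsigned B) {
    const uint32_t mask = (B < 32) ? ((1U << B) - 1) : 0xFFFFFFFFU;
    uint32_t word = 0;   /* output word currently being filled */
    unsigned shift = 0;  /* bit offset inside that word */
    unsigned i, w = 0;
    for (i = 0; i < 32; ++i) {
        const uint32_t v = in[i] & mask;
        word |= v << shift;
        if (shift + B >= 32) {
            out[w++] = word;                                    /* word full: flush */
            word = (shift + B == 32) ? 0 : v >> (32 - shift);   /* carry high bits  */
            shift = shift + B - 32;
        } else {
            shift += B;
        }
    }
}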
- OutReg = _mm_and_si128( _mm_srli_epi32(InReg,27) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack12_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + const __m128i mask = _mm_set1_epi32((1U << 12) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); - _mm_storeu_si128(out++, OutReg); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg,29) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -static void __SIMD_fastunpack4_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<4)-1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); +} + +static void __SIMD_fastpack13_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + const __m128i mask = _mm_set1_epi32((1U << 13) - 1); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpack14_32(const uint32_t 
*_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 14) - 1); - _mm_storeu_si128(out++, OutReg); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack15_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 15) - 1); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); 
+ _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack17_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const 
__m128i mask = _mm_set1_epi32((1U << 17) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack18_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 18) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_srli_epi32(InReg,28) ; - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack19_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + 
+ const __m128i mask = _mm_set1_epi32((1U << 19) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); +} +static void __SIMD_fastpack20_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 20) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); +} +static void __SIMD_fastpack21_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i 
OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); } +static void __SIMD_fastpack22_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 22) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); +} +static void __SIMD_fastpack23_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 23) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); +} +static void __SIMD_fastpack24_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 24) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); +} -static void __SIMD_fastunpack5_32(const __m128i* in, uint32_t * _out) { +static void __SIMD_fastpack25_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 25) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); +} - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<5)-1); +static void __SIMD_fastpack26_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = 
_mm_set1_epi32((1U << 26) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack27_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 27) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack28_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 28) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack29_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 29) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack30_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 30) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack31_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 31) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack32_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + 
++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastpack4_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U << 4) - 1); + uint32_t outer; - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); - _mm_storeu_si128(out++, OutReg); + for (outer = 0; outer < 4; ++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 4), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 5), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + InReg = _mm_and_si128(_mm_loadu_si128(in + 6), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 7), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; - OutReg 
= _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + in += 8; + } +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack8_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U << 8) - 1); + uint32_t outer; - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); + for (outer = 0; outer < 8; ++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + in += 4; + } +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastpack16_32(const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U << 16) - 1); + uint32_t outer; - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + for (outer = 0; outer < 16; ++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + in += 2; + } +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack1_32(const __m128i *in, uint32_t *_out) { + __m128i *out = (__m128i *)(_out); + __m128i InReg1 = _mm_loadu_si128(in); + __m128i InReg2 = InReg1; + __m128i OutReg1, OutReg2, OutReg3, OutReg4; + const __m128i mask = _mm_set1_epi32(1); + + uint32_t i, shift = 0; + + for (i = 0; i < 8; ++i) { + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, shift++), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, shift++), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, shift++), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, shift++), mask); + _mm_storeu_si128(out++, OutReg1); + _mm_storeu_si128(out++, OutReg2); + _mm_storeu_si128(out++, OutReg3); + _mm_storeu_si128(out++, OutReg4); + } +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack2_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 2) - 1); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,27) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack6_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<6)-1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack3_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 3) - 1); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 27), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 25), mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack7_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<7)-1); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack4_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 4) - 1); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,25) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); -} + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack8_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<8)-1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastunpack5_32(const __m128i *in, uint32_t *_out) { - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 5) - 1); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 25), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + _mm_storeu_si128(out++, OutReg); } +static void __SIMD_fastunpack6_32(const __m128i *in, uint32_t *_out) { + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 6) - 1); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack9_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<9)-1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), 
mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) 
, mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack7_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 7) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,23) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack10_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<10)-1); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); - _mm_storeu_si128(out++, 
OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack8_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 8) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack11_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<11)-1); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, 
OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack9_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 9) - 1); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,21) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack12_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<12)-1); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , 
mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastunpack10_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 10) - 1); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastunpack13_32(const __m128i* in, uint32_t * _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<13)-1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack11_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 11) - 1); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , 
mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,19) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack14_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_srli_epi32(InReg, 21); + _mm_storeu_si128(out++, OutReg); +} - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<14)-1); +static void __SIMD_fastunpack12_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_and_si128( InReg , mask); 
- _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 12) - 1); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg,18) ; - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack13_32(const __m128i *in, uint32_t *_out) { + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 13) - 1); -} + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack15_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<15)-1); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack14_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 14) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,17) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack16_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<16)-1); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); 
+ OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + _mm_storeu_si128(out++, OutReg); +} - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack15_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 15) - 1); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastunpack17_32(const __m128i* in, uint32_t * _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<17)-1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 17); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack16_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = 
_mm_set1_epi32((1U << 16) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,15) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack18_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_srli_epi32(InReg, 16); 
+ InReg = _mm_loadu_si128(++in); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<18)-1); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack17_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + 
__m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 17) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,14) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack19_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<19)-1); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack18_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 18) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); - 
_mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = 
+ _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,13) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack20_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<20)-1); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_srli_epi32(InReg, 14); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack19_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 19) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - _mm_storeu_si128(out++, 
OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,12) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack21_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<21)-1); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg, 13); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastunpack20_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 20) - 1); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) 
; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,11) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + _mm_storeu_si128(out++, OutReg); } +static void __SIMD_fastunpack21_32(const __m128i *in, uint32_t *_out) { + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack22_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<22)-1); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = 
_mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 11); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack22_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 22) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,10) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastunpack23_32(const __m128i* in, uint32_t * _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<23)-1); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), 
mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,11) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); - _mm_storeu_si128(out++, 
OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastunpack23_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 23) - 1); - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,9) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack24_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<24)-1); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg 
= _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - 
OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 9); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack24_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 24) - 1); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,8) ; - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack25_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<25)-1); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,11) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = 
_mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,9) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack25_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 25) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,7) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastunpack26_32(const __m128i* in, uint32_t * _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<26)-1); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); - _mm_storeu_si128(out++, 
OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 7); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack26_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 26) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), 
mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,6) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastunpack27_32(const __m128i* in, uint32_t * _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<27)-1); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,7) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,9) ; - InReg = _mm_loadu_si128(++in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 6); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack27_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 27) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,11) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 7); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,5) ; - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_srli_epi32(InReg, 11); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastunpack28_32(const __m128i* in, uint32_t * _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<28)-1); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 5); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack28_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 28) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = 
_mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - _mm_storeu_si128(out++, OutReg); + 
OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,4) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack29_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<29)-1); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - 
OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,11) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 4); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); - _mm_storeu_si128(out++, OutReg); +static void __SIMD_fastunpack29_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 29) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,5) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 
14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 5); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,7) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
29-21), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 7); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,9) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,3) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); + _mm_storeu_si128(out++, OutReg); -static void __SIMD_fastunpack30_32(const __m128i* in, uint32_t * _out) { + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<30)-1); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128( InReg , mask); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 3); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); +static void __SIMD_fastunpack30_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 30) - 1); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,2) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - 
OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,2) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); + + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); -static void __SIMD_fastunpack31_32(const __m128i* in, uint32_t * _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<31)-1); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 2); + _mm_storeu_si128(out++, OutReg); +} + +static void __SIMD_fastunpack31_32(const __m128i *in, uint32_t *_out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); - _mm_storeu_si128(out++, OutReg); + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 31) - 1); - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); 
+ InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = 
_mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,11) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,9) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 
10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,7) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 7); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,5) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 5); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,3) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,2) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 3); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg,1) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 2); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg, 1); + _mm_storeu_si128(out++, OutReg); } +void __SIMD_fastunpack32_32(const __m128i *in, uint32_t *_out) { + __m128i *out = (__m128i *)(_out); + uint32_t outer; -void __SIMD_fastunpack32_32(const __m128i* in, uint32_t * _out) { - 
__m128i* out = (__m128i*)(_out); - for(uint32_t outer=0; outer< 32 ;++outer) { + for (outer = 0; outer < 32; ++outer) { _mm_storeu_si128(out++, _mm_loadu_si128(in++)); } } +void simdunpack(const __m128i *in, uint32_t *out, const uint32_t bit) { + switch (bit) { + case 0: + SIMD_nullunpacker32(in, out); + return; + case 1: + __SIMD_fastunpack1_32(in, out); + return; -void simdunpack(const __m128i * in, uint32_t * out, const uint32_t bit) { - switch(bit) { - case 0: SIMD_nullunpacker32(in,out); return; - - case 1: __SIMD_fastunpack1_32(in,out); return; + case 2: + __SIMD_fastunpack2_32(in, out); + return; - case 2: __SIMD_fastunpack2_32(in,out); return; + case 3: + __SIMD_fastunpack3_32(in, out); + return; - case 3: __SIMD_fastunpack3_32(in,out); return; + case 4: + __SIMD_fastunpack4_32(in, out); + return; - case 4: __SIMD_fastunpack4_32(in,out); return; + case 5: + __SIMD_fastunpack5_32(in, out); + return; - case 5: __SIMD_fastunpack5_32(in,out); return; + case 6: + __SIMD_fastunpack6_32(in, out); + return; - case 6: __SIMD_fastunpack6_32(in,out); return; + case 7: + __SIMD_fastunpack7_32(in, out); + return; - case 7: __SIMD_fastunpack7_32(in,out); return; + case 8: + __SIMD_fastunpack8_32(in, out); + return; - case 8: __SIMD_fastunpack8_32(in,out); return; + case 9: + __SIMD_fastunpack9_32(in, out); + return; - case 9: __SIMD_fastunpack9_32(in,out); return; + case 10: + __SIMD_fastunpack10_32(in, out); + return; - case 10: __SIMD_fastunpack10_32(in,out); return; + case 11: + __SIMD_fastunpack11_32(in, out); + return; - case 11: __SIMD_fastunpack11_32(in,out); return; + case 12: + __SIMD_fastunpack12_32(in, out); + return; - case 12: __SIMD_fastunpack12_32(in,out); return; + case 13: + __SIMD_fastunpack13_32(in, out); + return; - case 13: __SIMD_fastunpack13_32(in,out); return; + case 14: + __SIMD_fastunpack14_32(in, out); + return; - case 14: __SIMD_fastunpack14_32(in,out); return; + case 15: + __SIMD_fastunpack15_32(in, out); + return; - case 15: __SIMD_fastunpack15_32(in,out); return; + case 16: + __SIMD_fastunpack16_32(in, out); + return; - case 16: __SIMD_fastunpack16_32(in,out); return; + case 17: + __SIMD_fastunpack17_32(in, out); + return; - case 17: __SIMD_fastunpack17_32(in,out); return; + case 18: + __SIMD_fastunpack18_32(in, out); + return; - case 18: __SIMD_fastunpack18_32(in,out); return; + case 19: + __SIMD_fastunpack19_32(in, out); + return; - case 19: __SIMD_fastunpack19_32(in,out); return; + case 20: + __SIMD_fastunpack20_32(in, out); + return; - case 20: __SIMD_fastunpack20_32(in,out); return; + case 21: + __SIMD_fastunpack21_32(in, out); + return; - case 21: __SIMD_fastunpack21_32(in,out); return; + case 22: + __SIMD_fastunpack22_32(in, out); + return; - case 22: __SIMD_fastunpack22_32(in,out); return; + case 23: + __SIMD_fastunpack23_32(in, out); + return; - case 23: __SIMD_fastunpack23_32(in,out); return; + case 24: + __SIMD_fastunpack24_32(in, out); + return; - case 24: __SIMD_fastunpack24_32(in,out); return; + case 25: + __SIMD_fastunpack25_32(in, out); + return; - case 25: __SIMD_fastunpack25_32(in,out); return; + case 26: + __SIMD_fastunpack26_32(in, out); + return; - case 26: __SIMD_fastunpack26_32(in,out); return; + case 27: + __SIMD_fastunpack27_32(in, out); + return; - case 27: __SIMD_fastunpack27_32(in,out); return; + case 28: + __SIMD_fastunpack28_32(in, out); + return; - case 28: __SIMD_fastunpack28_32(in,out); return; + case 29: + __SIMD_fastunpack29_32(in, out); + return; - case 29: __SIMD_fastunpack29_32(in,out); return; + case 30: + 
__SIMD_fastunpack30_32(in, out); + return; - case 30: __SIMD_fastunpack30_32(in,out); return; + case 31: + __SIMD_fastunpack31_32(in, out); + return; - case 31: __SIMD_fastunpack31_32(in,out); return; + case 32: + __SIMD_fastunpack32_32(in, out); + return; - case 32: __SIMD_fastunpack32_32(in,out); return; - - default: break; - } + default: + break; + } } +/*assumes that integers fit in the prescribed number of bits*/ +void simdpackwithoutmask(const uint32_t *in, __m128i *out, const uint32_t bit) { + switch (bit) { + case 0: + return; + case 1: + __SIMD_fastpackwithoutmask1_32(in, out); + return; - /*assumes that integers fit in the prescribed number of bits*/ -void simdpackwithoutmask(const uint32_t * in, __m128i * out, const uint32_t bit) { - switch(bit) { - case 0: return; - - case 1: __SIMD_fastpackwithoutmask1_32(in,out); return; + case 2: + __SIMD_fastpackwithoutmask2_32(in, out); + return; - case 2: __SIMD_fastpackwithoutmask2_32(in,out); return; + case 3: + __SIMD_fastpackwithoutmask3_32(in, out); + return; - case 3: __SIMD_fastpackwithoutmask3_32(in,out); return; + case 4: + __SIMD_fastpackwithoutmask4_32(in, out); + return; - case 4: __SIMD_fastpackwithoutmask4_32(in,out); return; + case 5: + __SIMD_fastpackwithoutmask5_32(in, out); + return; - case 5: __SIMD_fastpackwithoutmask5_32(in,out); return; + case 6: + __SIMD_fastpackwithoutmask6_32(in, out); + return; - case 6: __SIMD_fastpackwithoutmask6_32(in,out); return; + case 7: + __SIMD_fastpackwithoutmask7_32(in, out); + return; - case 7: __SIMD_fastpackwithoutmask7_32(in,out); return; + case 8: + __SIMD_fastpackwithoutmask8_32(in, out); + return; - case 8: __SIMD_fastpackwithoutmask8_32(in,out); return; + case 9: + __SIMD_fastpackwithoutmask9_32(in, out); + return; - case 9: __SIMD_fastpackwithoutmask9_32(in,out); return; + case 10: + __SIMD_fastpackwithoutmask10_32(in, out); + return; - case 10: __SIMD_fastpackwithoutmask10_32(in,out); return; + case 11: + __SIMD_fastpackwithoutmask11_32(in, out); + return; - case 11: __SIMD_fastpackwithoutmask11_32(in,out); return; + case 12: + __SIMD_fastpackwithoutmask12_32(in, out); + return; - case 12: __SIMD_fastpackwithoutmask12_32(in,out); return; + case 13: + __SIMD_fastpackwithoutmask13_32(in, out); + return; - case 13: __SIMD_fastpackwithoutmask13_32(in,out); return; + case 14: + __SIMD_fastpackwithoutmask14_32(in, out); + return; - case 14: __SIMD_fastpackwithoutmask14_32(in,out); return; + case 15: + __SIMD_fastpackwithoutmask15_32(in, out); + return; - case 15: __SIMD_fastpackwithoutmask15_32(in,out); return; + case 16: + __SIMD_fastpackwithoutmask16_32(in, out); + return; - case 16: __SIMD_fastpackwithoutmask16_32(in,out); return; + case 17: + __SIMD_fastpackwithoutmask17_32(in, out); + return; - case 17: __SIMD_fastpackwithoutmask17_32(in,out); return; + case 18: + __SIMD_fastpackwithoutmask18_32(in, out); + return; - case 18: __SIMD_fastpackwithoutmask18_32(in,out); return; + case 19: + __SIMD_fastpackwithoutmask19_32(in, out); + return; - case 19: __SIMD_fastpackwithoutmask19_32(in,out); return; + case 20: + __SIMD_fastpackwithoutmask20_32(in, out); + return; - case 20: __SIMD_fastpackwithoutmask20_32(in,out); return; + case 21: + __SIMD_fastpackwithoutmask21_32(in, out); + return; - case 21: __SIMD_fastpackwithoutmask21_32(in,out); return; + case 22: + __SIMD_fastpackwithoutmask22_32(in, out); + return; - case 22: __SIMD_fastpackwithoutmask22_32(in,out); return; + case 23: + __SIMD_fastpackwithoutmask23_32(in, out); + return; - case 23: 
__SIMD_fastpackwithoutmask23_32(in,out); return; + case 24: + __SIMD_fastpackwithoutmask24_32(in, out); + return; - case 24: __SIMD_fastpackwithoutmask24_32(in,out); return; + case 25: + __SIMD_fastpackwithoutmask25_32(in, out); + return; - case 25: __SIMD_fastpackwithoutmask25_32(in,out); return; + case 26: + __SIMD_fastpackwithoutmask26_32(in, out); + return; - case 26: __SIMD_fastpackwithoutmask26_32(in,out); return; + case 27: + __SIMD_fastpackwithoutmask27_32(in, out); + return; - case 27: __SIMD_fastpackwithoutmask27_32(in,out); return; + case 28: + __SIMD_fastpackwithoutmask28_32(in, out); + return; - case 28: __SIMD_fastpackwithoutmask28_32(in,out); return; + case 29: + __SIMD_fastpackwithoutmask29_32(in, out); + return; - case 29: __SIMD_fastpackwithoutmask29_32(in,out); return; + case 30: + __SIMD_fastpackwithoutmask30_32(in, out); + return; - case 30: __SIMD_fastpackwithoutmask30_32(in,out); return; + case 31: + __SIMD_fastpackwithoutmask31_32(in, out); + return; - case 31: __SIMD_fastpackwithoutmask31_32(in,out); return; + case 32: + __SIMD_fastpackwithoutmask32_32(in, out); + return; - case 32: __SIMD_fastpackwithoutmask32_32(in,out); return; - - default: break; - } + default: + break; + } } +/*assumes that integers fit in the prescribed number of bits*/ +void simdpack(const uint32_t *in, __m128i *out, const uint32_t bit) { + switch (bit) { + case 0: + return; + + case 1: + __SIMD_fastpack1_32(in, out); + return; + case 2: + __SIMD_fastpack2_32(in, out); + return; - /*assumes that integers fit in the prescribed number of bits*/ -void simdpack(const uint32_t * in, __m128i * out, const uint32_t bit) { - switch(bit) { - case 0: return; + case 3: + __SIMD_fastpack3_32(in, out); + return; - case 1: __SIMD_fastpack1_32(in,out); return; + case 4: + __SIMD_fastpack4_32(in, out); + return; - case 2: __SIMD_fastpack2_32(in,out); return; + case 5: + __SIMD_fastpack5_32(in, out); + return; - case 3: __SIMD_fastpack3_32(in,out); return; + case 6: + __SIMD_fastpack6_32(in, out); + return; - case 4: __SIMD_fastpack4_32(in,out); return; + case 7: + __SIMD_fastpack7_32(in, out); + return; - case 5: __SIMD_fastpack5_32(in,out); return; + case 8: + __SIMD_fastpack8_32(in, out); + return; - case 6: __SIMD_fastpack6_32(in,out); return; + case 9: + __SIMD_fastpack9_32(in, out); + return; - case 7: __SIMD_fastpack7_32(in,out); return; + case 10: + __SIMD_fastpack10_32(in, out); + return; - case 8: __SIMD_fastpack8_32(in,out); return; + case 11: + __SIMD_fastpack11_32(in, out); + return; - case 9: __SIMD_fastpack9_32(in,out); return; + case 12: + __SIMD_fastpack12_32(in, out); + return; - case 10: __SIMD_fastpack10_32(in,out); return; + case 13: + __SIMD_fastpack13_32(in, out); + return; - case 11: __SIMD_fastpack11_32(in,out); return; + case 14: + __SIMD_fastpack14_32(in, out); + return; - case 12: __SIMD_fastpack12_32(in,out); return; + case 15: + __SIMD_fastpack15_32(in, out); + return; - case 13: __SIMD_fastpack13_32(in,out); return; + case 16: + __SIMD_fastpack16_32(in, out); + return; - case 14: __SIMD_fastpack14_32(in,out); return; + case 17: + __SIMD_fastpack17_32(in, out); + return; - case 15: __SIMD_fastpack15_32(in,out); return; + case 18: + __SIMD_fastpack18_32(in, out); + return; - case 16: __SIMD_fastpack16_32(in,out); return; + case 19: + __SIMD_fastpack19_32(in, out); + return; - case 17: __SIMD_fastpack17_32(in,out); return; + case 20: + __SIMD_fastpack20_32(in, out); + return; - case 18: __SIMD_fastpack18_32(in,out); return; + case 21: + __SIMD_fastpack21_32(in, out); + return; - 
case 19: __SIMD_fastpack19_32(in,out); return; + case 22: + __SIMD_fastpack22_32(in, out); + return; - case 20: __SIMD_fastpack20_32(in,out); return; + case 23: + __SIMD_fastpack23_32(in, out); + return; - case 21: __SIMD_fastpack21_32(in,out); return; + case 24: + __SIMD_fastpack24_32(in, out); + return; - case 22: __SIMD_fastpack22_32(in,out); return; + case 25: + __SIMD_fastpack25_32(in, out); + return; - case 23: __SIMD_fastpack23_32(in,out); return; + case 26: + __SIMD_fastpack26_32(in, out); + return; - case 24: __SIMD_fastpack24_32(in,out); return; + case 27: + __SIMD_fastpack27_32(in, out); + return; - case 25: __SIMD_fastpack25_32(in,out); return; + case 28: + __SIMD_fastpack28_32(in, out); + return; - case 26: __SIMD_fastpack26_32(in,out); return; + case 29: + __SIMD_fastpack29_32(in, out); + return; - case 27: __SIMD_fastpack27_32(in,out); return; + case 30: + __SIMD_fastpack30_32(in, out); + return; - case 28: __SIMD_fastpack28_32(in,out); return; + case 31: + __SIMD_fastpack31_32(in, out); + return; - case 29: __SIMD_fastpack29_32(in,out); return; + case 32: + __SIMD_fastpack32_32(in, out); + return; - case 30: __SIMD_fastpack30_32(in,out); return; + default: + break; + } +} - case 31: __SIMD_fastpack31_32(in,out); return; +__m128i *simdpack_shortlength(const uint32_t *in, int length, __m128i *out, + const uint32_t bit) { + int k; + int inwordpointer; + __m128i P; + uint32_t firstpass; + if (bit == 0) + return out; /* nothing to do */ + if (bit == 32) { + memcpy(out, in, length * sizeof(uint32_t)); + return (__m128i *)((uint32_t *)out + length); + } + inwordpointer = 0; + P = _mm_setzero_si128(); + for (k = 0; k < length / 4; ++k) { + __m128i value = _mm_loadu_si128(((const __m128i *)in + k)); + P = _mm_or_si128(P, _mm_slli_epi32(value, inwordpointer)); + firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + _mm_storeu_si128(out++, P); + P = _mm_srli_epi32(value, firstpass); + inwordpointer = bit - firstpass; + } + } + if (length % 4 != 0) { + uint32_t buffer[4]; + __m128i value; + for (k = 0; k < (length % 4); ++k) { + buffer[k] = in[length / 4 * 4 + k]; + } + for (k = (length % 4); k < 4; ++k) { + buffer[k] = 0; + } + value = _mm_loadu_si128((__m128i *)buffer); + P = _mm_or_si128(P, _mm_slli_epi32(value, inwordpointer)); + firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + _mm_storeu_si128(out++, P); + P = _mm_srli_epi32(value, firstpass); + inwordpointer = bit - firstpass; + } + } + if (inwordpointer != 0) { + _mm_storeu_si128(out++, P); + } + return out; +} - case 32: __SIMD_fastpack32_32(in,out); return; +const __m128i *simdunpack_shortlength(const __m128i *in, int length, + uint32_t *out, const uint32_t bit) { + int k; + __m128i maskbits; + int inwordpointer; + __m128i P; + if (length == 0) + return in; + if (bit == 0) { + for (k = 0; k < length; ++k) { + out[k] = 0; + } + return in; + } + if (bit == 32) { + memcpy(out, in, length * sizeof(uint32_t)); + return (const __m128i *)((uint32_t *)in + length); + } + maskbits = _mm_set1_epi32((1U << bit) - 1); + inwordpointer = 0; + P = _mm_loadu_si128((__m128i *)in); + ++in; + if (length % 4 == 0) { + + for (k = 0; k + 1 < length / 4; ++k) { + __m128i answer = _mm_srli_epi32(P, inwordpointer); + const uint32_t firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + P = _mm_loadu_si128((__m128i *)in); + ++in; + answer = _mm_or_si128(_mm_slli_epi32(P, 
firstpass), answer); + inwordpointer = bit - firstpass; + } + answer = _mm_and_si128(maskbits, answer); + _mm_storeu_si128((__m128i *)out, answer); + out += 4; + } + if (k < length / 4) { + __m128i answer = _mm_srli_epi32(P, inwordpointer); + const uint32_t firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else if (bit == firstpass) { + inwordpointer = 0; + } else { + P = _mm_loadu_si128((__m128i *)in); + ++in; + answer = _mm_or_si128(_mm_slli_epi32(P, firstpass), answer); + inwordpointer = bit - firstpass; + } + answer = _mm_and_si128(maskbits, answer); + _mm_storeu_si128((__m128i *)out, answer); + out += 4; + } - default: break; + } else { + for (k = 0; k < length / 4; ++k) { + __m128i answer = _mm_srli_epi32(P, inwordpointer); + const uint32_t firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + P = _mm_loadu_si128((__m128i *)in); + ++in; + answer = _mm_or_si128(_mm_slli_epi32(P, firstpass), answer); + inwordpointer = bit - firstpass; + } + answer = _mm_and_si128(maskbits, answer); + _mm_storeu_si128((__m128i *)out, answer); + out += 4; } + uint32_t buffer[4]; + __m128i answer = _mm_srli_epi32(P, inwordpointer); + const uint32_t firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else if (bit == firstpass) { + inwordpointer = 0; + } else { + P = _mm_loadu_si128((__m128i *)in); + ++in; + answer = _mm_or_si128(_mm_slli_epi32(P, firstpass), answer); + inwordpointer = bit - firstpass; + } + answer = _mm_and_si128(maskbits, answer); + _mm_storeu_si128((__m128i *)buffer, answer); + for (k = 0; k < (length % 4); ++k) { + *out = buffer[k]; + ++out; + } + } + return in; +} + +void simdfastset(__m128i *in128, uint32_t b, uint32_t value, size_t index) { + uint32_t *in = (uint32_t *)in128; + const int lane = index % 4; /* we have 4 interleaved lanes */ + const int bitsinlane = (index / 4) * b; /* how many bits in lane */ + const int firstwordinlane = bitsinlane / 32; + const int secondwordinlane = (bitsinlane + b - 1) / 32; + const uint32_t mask = (1U << b) - 1; + if (b == 0) + return; + /* we zero */ + if (b == 32) + in[4 * firstwordinlane + lane] = 0; + else + in[4 * firstwordinlane + lane] &= ~(mask << (bitsinlane % 32)); + + /* we write */ + in[4 * firstwordinlane + lane] |= (value << (bitsinlane % 32)); + + if (firstwordinlane == secondwordinlane) { + /* easy common case*/ + return; + } else { + /* harder case where we need to combine two words */ + const int firstbits = 32 - (bitsinlane % 32); + const int usablebits = b - firstbits; + const uint32_t mask2 = (1U << usablebits) - 1; + in[4 * firstwordinlane + 4 + lane] &= ~mask2; /* we zero */ + in[4 * firstwordinlane + 4 + lane] |= value >> firstbits; /* we write */ + return; + } } +int simdpack_compressedbytes(int length, const uint32_t bit) { + if (bit == 0) + return 0; /* nothing to do */ + if (bit == 32) { + return length * sizeof(uint32_t); + } + return (((length + 3) / 4) * bit + 31) / 32 * sizeof(__m128i); +} +__m128i *simdpack_length(const uint32_t *in, size_t length, __m128i *out, + const uint32_t bit) { + size_t k; + for (k = 0; k < length / SIMDBlockSize; ++k) { + simdpack(in, out, bit); + in += SIMDBlockSize; + out += bit; + } + return simdpack_shortlength(in, length % SIMDBlockSize, out, bit); +} +const __m128i *simdunpack_length(const __m128i *in, size_t length, + uint32_t *out, const uint32_t bit) { + size_t k; + for (k = 0; k < length / SIMDBlockSize; ++k) { + 
simdunpack(in, out, bit); + out += SIMDBlockSize; + in += bit; + } + return simdunpack_shortlength(in, length % SIMDBlockSize, out, bit); +} diff --git a/src/simdcomputil.c b/src/simdcomputil.c index c597aff..4090b03 100644 --- a/src/simdcomputil.c +++ b/src/simdcomputil.c @@ -1,55 +1,248 @@ -#include "simdcomputil.h" +/** + * This code is released under a BSD License. + */ -__attribute__((always_inline)) -static inline __m128i Delta(__m128i curr, __m128i prev) { - return _mm_sub_epi32(curr, - _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); -} +#include "simdcomputil.h" +#ifdef __SSE4_1__ +#include <smmintrin.h> +#endif +#include <assert.h> +#define Delta(curr, prev) \ + _mm_sub_epi32( \ + curr, _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))) -// returns the integer logarithm of v (bit width) +/* returns the integer logarithm of v (bit width) */ uint32_t bits(const uint32_t v) { #ifdef _MSC_VER - if (v == 0) { - return 0; - } - unsigned long answer; - _BitScanReverse(&answer, v); - return answer + 1; + unsigned long answer; + if (v == 0) { + return 0; + } + _BitScanReverse(&answer, v); + return answer + 1; #else - return v == 0 ? 0 : 32 - __builtin_clz(v); // assume GCC-like compiler if not microsoft + return v == 0 ? 0 + : 32 - __builtin_clz( + v); /* assume GCC-like compiler if not microsoft */ #endif } -__attribute__ ((pure)) -uint32_t maxbits(const uint32_t * begin) { - uint32_t accumulator = 0; - for (const uint32_t * k = begin; k != begin + SIMDBlockSize; ++k) { - accumulator |= *k; +static uint32_t maxbitas32int(const __m128i accumulator) { + const __m128i _tmp1 = _mm_or_si128( + _mm_srli_si128(accumulator, 8), + accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m128i _tmp2 = + _mm_or_si128(_mm_srli_si128(_tmp1, 4), + _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + uint32_t ans = _mm_cvtsi128_si32(_tmp2); + return bits(ans); +} + +SIMDCOMP_PURE uint32_t maxbits(const uint32_t *begin) { + const __m128i *pin = (const __m128i *)(begin); + __m128i accumulator = _mm_loadu_si128(pin); + uint32_t k = 1; + for (; 4 * k < SIMDBlockSize; ++k) { + __m128i newvec = _mm_loadu_si128(pin + k); + accumulator = _mm_or_si128(accumulator, newvec); + } + return maxbitas32int(accumulator); +} +static uint32_t orasint(const __m128i accumulator) { + const __m128i _tmp1 = _mm_or_si128( + _mm_srli_si128(accumulator, 8), + accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m128i _tmp2 = + _mm_or_si128(_mm_srli_si128(_tmp1, 4), + _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + return _mm_cvtsi128_si32(_tmp2); +} + +#ifdef __SSE4_1__ + +static uint32_t minasint(const __m128i accumulator) { + const __m128i _tmp1 = _mm_min_epu32( + _mm_srli_si128(accumulator, 8), + accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m128i _tmp2 = + _mm_min_epu32(_mm_srli_si128(_tmp1, 4), + _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + return _mm_cvtsi128_si32(_tmp2); +} + +static uint32_t maxasint(const __m128i accumulator) { + const __m128i _tmp1 = _mm_max_epu32( + _mm_srli_si128(accumulator, 8), + accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m128i _tmp2 = + _mm_max_epu32(_mm_srli_si128(_tmp1, 4), + _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + return _mm_cvtsi128_si32(_tmp2); +} + +uint32_t simdmin(const uint32_t *in) { + const __m128i *pin = (const __m128i *)(in); + __m128i accumulator = _mm_loadu_si128(pin); + uint32_t k = 1; + for (; 4 * k < SIMDBlockSize; ++k) { + 
__m128i newvec = _mm_loadu_si128(pin + k); + accumulator = _mm_min_epu32(accumulator, newvec); + } + return minasint(accumulator); +} + +void simdmaxmin(const uint32_t *in, uint32_t *getmin, uint32_t *getmax) { + const __m128i *pin = (const __m128i *)(in); + __m128i minaccumulator = _mm_loadu_si128(pin); + __m128i maxaccumulator = minaccumulator; + uint32_t k = 1; + for (; 4 * k < SIMDBlockSize; ++k) { + __m128i newvec = _mm_loadu_si128(pin + k); + minaccumulator = _mm_min_epu32(minaccumulator, newvec); + maxaccumulator = _mm_max_epu32(maxaccumulator, newvec); + } + *getmin = minasint(minaccumulator); + *getmax = maxasint(maxaccumulator); +} + +uint32_t simdmin_length(const uint32_t *in, uint32_t length) { + uint32_t currentmin = 0xFFFFFFFF; + uint32_t lengthdividedby4 = length / 4; + uint32_t offset = lengthdividedby4 * 4; + uint32_t k; + if (lengthdividedby4 > 0) { + const __m128i *pin = (const __m128i *)(in); + __m128i accumulator = _mm_loadu_si128(pin); + k = 1; + for (; 4 * k < lengthdividedby4 * 4; ++k) { + __m128i newvec = _mm_loadu_si128(pin + k); + accumulator = _mm_min_epu32(accumulator, newvec); } - return bits(accumulator); + currentmin = minasint(accumulator); + } + for (k = offset; k < length; ++k) + if (in[k] < currentmin) + currentmin = in[k]; + return currentmin; } -static uint32_t maxbitas32int(const __m128i accumulator) { - uint32_t tmparray[4]; - _mm_storeu_si128((__m128i *) (tmparray), accumulator); - return bits(tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]); -} - - -// maxbit over 128 integers (SIMDBlockSize) with provided initial value -uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) { - __m128i initoffset = _mm_set1_epi32 (initvalue); - const __m128i* pin = (const __m128i*)(in); - __m128i newvec = _mm_load_si128(pin); - __m128i accumulator = Delta(newvec , initoffset); - __m128i oldvec = newvec; - for(uint32_t k = 1; 4*k < SIMDBlockSize; ++k) { - newvec = _mm_load_si128(pin+k); - accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec)); - oldvec = newvec; +void simdmaxmin_length(const uint32_t *in, uint32_t length, uint32_t *getmin, + uint32_t *getmax) { + uint32_t lengthdividedby4 = length / 4; + uint32_t offset = lengthdividedby4 * 4; + uint32_t k; + *getmin = 0xFFFFFFFF; + *getmax = 0; + if (lengthdividedby4 > 0) { + const __m128i *pin = (const __m128i *)(in); + __m128i minaccumulator = _mm_loadu_si128(pin); + __m128i maxaccumulator = minaccumulator; + k = 1; + for (; 4 * k < lengthdividedby4 * 4; ++k) { + __m128i newvec = _mm_loadu_si128(pin + k); + minaccumulator = _mm_min_epu32(minaccumulator, newvec); + maxaccumulator = _mm_max_epu32(maxaccumulator, newvec); } - initoffset = oldvec; - return maxbitas32int(accumulator); + *getmin = minasint(minaccumulator); + *getmax = maxasint(maxaccumulator); + } + for (k = offset; k < length; ++k) { + if (in[k] < *getmin) + *getmin = in[k]; + if (in[k] > *getmax) + *getmax = in[k]; + } } +#endif + +SIMDCOMP_PURE uint32_t maxbits_length(const uint32_t *in, uint32_t length) { + uint32_t k; + uint32_t lengthdividedby4 = length / 4; + uint32_t offset = lengthdividedby4 * 4; + uint32_t bigxor = 0; + if (lengthdividedby4 > 0) { + const __m128i *pin = (const __m128i *)(in); + __m128i accumulator = _mm_loadu_si128(pin); + k = 1; + for (; 4 * k < 4 * lengthdividedby4; ++k) { + __m128i newvec = _mm_loadu_si128(pin + k); + accumulator = _mm_or_si128(accumulator, newvec); + } + bigxor = orasint(accumulator); + } + for (k = offset; k < length; ++k) + bigxor |= in[k]; + return bits(bigxor); +} + +/* maxbit 
over 128 integers (SIMDBlockSize) with provided initial value */ +uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t *in) { + __m128i initoffset = _mm_set1_epi32(initvalue); + const __m128i *pin = (const __m128i *)(in); + __m128i newvec = _mm_loadu_si128(pin); + __m128i accumulator = Delta(newvec, initoffset); + __m128i oldvec = newvec; + uint32_t k = 1; + for (; 4 * k < SIMDBlockSize; ++k) { + newvec = _mm_loadu_si128(pin + k); + accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec)); + oldvec = newvec; + } + initoffset = oldvec; + return maxbitas32int(accumulator); +} + +/* maxbit over |length| integers with provided initial value */ +uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t *in, + uint32_t length) { + __m128i newvec; + __m128i oldvec; + __m128i initoffset; + __m128i accumulator; + const __m128i *pin; + uint32_t tmparray[4]; + uint32_t k = 1; + uint32_t acc; + + assert(length > 0); + + pin = (const __m128i *)(in); + initoffset = _mm_set1_epi32(initvalue); + switch (length) { + case 1: + newvec = _mm_set1_epi32(in[0]); + break; + case 2: + newvec = _mm_setr_epi32(in[0], in[1], in[1], in[1]); + break; + case 3: + newvec = _mm_setr_epi32(in[0], in[1], in[2], in[2]); + break; + default: + newvec = _mm_loadu_si128(pin); + break; + } + accumulator = Delta(newvec, initoffset); + oldvec = newvec; + + /* process 4 integers and build an accumulator */ + while (k * 4 + 4 <= length) { + newvec = _mm_loadu_si128(pin + k); + accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec)); + oldvec = newvec; + k++; + } + + /* extract the accumulator as an integer */ + _mm_storeu_si128((__m128i *)(tmparray), accumulator); + acc = tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]; + + /* now process the remaining integers */ + for (k *= 4; k < length; k++) + acc |= in[k] - (k == 0 ? initvalue : in[k - 1]); + + /* return the number of bits */ + return bits(acc); +} diff --git a/src/simdfor.c b/src/simdfor.c new file mode 100644 index 0000000..abd0955 --- /dev/null +++ b/src/simdfor.c @@ -0,0 +1,15200 @@ +/** + * This code is released under a BSD License. 
+ */ + +#include "simdfor.h" + +static __m128i iunpackFOR0(__m128i initOffset, const __m128i *_in, + uint32_t *_out) { + __m128i *out = (__m128i *)(_out); + int i; + (void)_in; + for (i = 0; i < 8; ++i) { + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + } + + return initOffset; +} + +static void ipackFOR0(__m128i initOffset, const uint32_t *_in, __m128i *out) { + (void)initOffset; + (void)_in; + (void)out; +} + +static void ipackFOR1(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + 
CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR2(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR3(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR4(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR5(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR6(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR7(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR8(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR9(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR10(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR11(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR12(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + 
++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR13(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; 
+ CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR14(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR15(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); 
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR16(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + 
CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR17(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i 
CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR18(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 
16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR19(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; 
+ CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); 
+ ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR20(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + 
CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR21(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + 
__m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; 
+ CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR22(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 
2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR23(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); 
+ ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR24(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR25(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); 
+ _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR26(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + 
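+ /* the previous 26-bit value straddled the word boundary: its remaining 18 high bits seed the new output word */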
OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + 
_mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR27(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR28(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + 
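+ /* load the next 4 input values and subtract the frame-of-reference offset before packing them */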
CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR29(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + 
_mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + 
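+ /* OR the offset-adjusted value into the output word at the current bit position (6) */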
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR30(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR31(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + 
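+ /* word complete: store it; 3 bits of the last 31-bit value spill over into the next word */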
_mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_store_si128(out, OutReg); +} + +static void ipackFOR32(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i InReg = _mm_load_si128(in); + (void)initOffset; + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = 
_mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); +} + +static __m128i iunpackFOR1(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 1) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
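+ /* each step shifts out the next 1-bit field, masks it, restores the reference offset and stores 4 decoded values */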
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR2(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 2) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, 
mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR3(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 3) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR4(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 4) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + 
OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR5(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 5) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = 
_mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR6(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 6) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); 
+ + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR7(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 7) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR8(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 8) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg 
= _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR9(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 9) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR10(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = 
_mm_set1_epi32((1U << 10) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR11(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 11) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR12(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 12) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR13(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 13) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR14(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 14) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR15(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 15) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR16(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 16) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, 
mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + 
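/* final output of this 32-value block; the frame-of-reference offset is returned unchanged */ +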
_mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR17(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 17) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR18(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + 
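/* scratch registers; mask selects the low 18 bits of each 32-bit lane */ +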
__m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 18) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR19(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 19) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + 
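/* this 19-bit value lies entirely within the current word: shift, mask, then add the offset */ +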
OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR20(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 20) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + 
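/* this 20-bit value straddles two packed words: 12 bits come from the current word, the remaining 8 are OR'd in from the next */ +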
OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR21(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + 
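/* this 21-bit value straddles two packed words: 6 bits from the current word, 15 from the next */ +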
OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR22(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 22) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = 
_mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = 
+ _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR23(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 23) - 1); + + tmp = InReg; + OutReg 
= _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg 
= tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR24(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 24) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + 
++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR25(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = 
_mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 25) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR26(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 26) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = 
_mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR27(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 27) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + 
++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); 
+ + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR28(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 28) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 
8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR29(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 29) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg 
= _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = 
_mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR30(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 30) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = 
_mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR31(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 31) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + ++in; + InReg = _mm_load_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + return initOffset; +} + +static __m128i iunpackFOR32(__m128i initvalue, const __m128i *in, + uint32_t *_out) { + __m128i *mout = (__m128i *)_out; + __m128i invec; + size_t k; + (void)initvalue; + for (k = 0; k < 128 / 4; ++k) { + invec = _mm_load_si128(in++); + _mm_store_si128(mout++, invec); + } + return invec; +} + +void simdpackFOR(uint32_t initvalue, const uint32_t *in, __m128i *out, + const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32(initvalue); + switch (bit) { + case 0: + ipackFOR0(initOffset, in, out); + break; + + case 1: + ipackFOR1(initOffset, in, out); + break; + + case 2: + ipackFOR2(initOffset, in, out); + break; + + case 3: + ipackFOR3(initOffset, in, out); + break; + + case 4: + ipackFOR4(initOffset, in, out); + break; + + case 5: + ipackFOR5(initOffset, in, out); + break; + + case 6: + ipackFOR6(initOffset, in, out); + break; + + case 7: + ipackFOR7(initOffset, in, out); + break; + + case 8: + ipackFOR8(initOffset, in, out); + break; + + case 9: + ipackFOR9(initOffset, in, out); + break; + + case 10: + ipackFOR10(initOffset, in, out); + break; + + case 11: + ipackFOR11(initOffset, in, out); + break; + + case 12: + ipackFOR12(initOffset, in, 
out); + break; + + case 13: + ipackFOR13(initOffset, in, out); + break; + + case 14: + ipackFOR14(initOffset, in, out); + break; + + case 15: + ipackFOR15(initOffset, in, out); + break; + + case 16: + ipackFOR16(initOffset, in, out); + break; + + case 17: + ipackFOR17(initOffset, in, out); + break; + + case 18: + ipackFOR18(initOffset, in, out); + break; + + case 19: + ipackFOR19(initOffset, in, out); + break; + + case 20: + ipackFOR20(initOffset, in, out); + break; + + case 21: + ipackFOR21(initOffset, in, out); + break; + + case 22: + ipackFOR22(initOffset, in, out); + break; + + case 23: + ipackFOR23(initOffset, in, out); + break; + + case 24: + ipackFOR24(initOffset, in, out); + break; + + case 25: + ipackFOR25(initOffset, in, out); + break; + + case 26: + ipackFOR26(initOffset, in, out); + break; + + case 27: + ipackFOR27(initOffset, in, out); + break; + + case 28: + ipackFOR28(initOffset, in, out); + break; + + case 29: + ipackFOR29(initOffset, in, out); + break; + + case 30: + ipackFOR30(initOffset, in, out); + break; + + case 31: + ipackFOR31(initOffset, in, out); + break; + + case 32: + ipackFOR32(initOffset, in, out); + break; + + default: + break; + } +} + +void simdunpackFOR(uint32_t initvalue, const __m128i *in, uint32_t *out, + const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32(initvalue); + switch (bit) { + case 0: + iunpackFOR0(initOffset, in, out); + break; + + case 1: + iunpackFOR1(initOffset, in, out); + break; + + case 2: + iunpackFOR2(initOffset, in, out); + break; + + case 3: + iunpackFOR3(initOffset, in, out); + break; + + case 4: + iunpackFOR4(initOffset, in, out); + break; + + case 5: + iunpackFOR5(initOffset, in, out); + break; + + case 6: + iunpackFOR6(initOffset, in, out); + break; + + case 7: + iunpackFOR7(initOffset, in, out); + break; + + case 8: + iunpackFOR8(initOffset, in, out); + break; + + case 9: + iunpackFOR9(initOffset, in, out); + break; + + case 10: + iunpackFOR10(initOffset, in, out); + break; + + case 11: + iunpackFOR11(initOffset, in, out); + break; + + case 12: + iunpackFOR12(initOffset, in, out); + break; + + case 13: + iunpackFOR13(initOffset, in, out); + break; + + case 14: + iunpackFOR14(initOffset, in, out); + break; + + case 15: + iunpackFOR15(initOffset, in, out); + break; + + case 16: + iunpackFOR16(initOffset, in, out); + break; + + case 17: + iunpackFOR17(initOffset, in, out); + break; + + case 18: + iunpackFOR18(initOffset, in, out); + break; + + case 19: + iunpackFOR19(initOffset, in, out); + break; + + case 20: + iunpackFOR20(initOffset, in, out); + break; + + case 21: + iunpackFOR21(initOffset, in, out); + break; + + case 22: + iunpackFOR22(initOffset, in, out); + break; + + case 23: + iunpackFOR23(initOffset, in, out); + break; + + case 24: + iunpackFOR24(initOffset, in, out); + break; + + case 25: + iunpackFOR25(initOffset, in, out); + break; + + case 26: + iunpackFOR26(initOffset, in, out); + break; + + case 27: + iunpackFOR27(initOffset, in, out); + break; + + case 28: + iunpackFOR28(initOffset, in, out); + break; + + case 29: + iunpackFOR29(initOffset, in, out); + break; + + case 30: + iunpackFOR30(initOffset, in, out); + break; + + case 31: + iunpackFOR31(initOffset, in, out); + break; + + case 32: + iunpackFOR32(initOffset, in, out); + break; + + default: + break; + } +} + +uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, + int slot) { + const uint32_t *pin = (const uint32_t *)in; + if (bit == 0) { + return initvalue; + } else if (bit == 32) { + /* silly special case */ + return pin[slot]; 
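+    /* General case below: the packed block stores 4 interleaved 32-bit lanes,
+       so slot i lives in lane i%4 at bit offset (i/4)*bit inside that lane;
+       when that field straddles a 32-bit boundary, the low part comes from one
+       word of the lane and the remainder from the lane's next word, which sits
+       4 positions further in the interleaved layout. */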
+ } else { + const int lane = slot % 4; /* we have 4 interleaved lanes */ + const int bitsinlane = (slot / 4) * bit; /* how many bits in lane */ + const int firstwordinlane = bitsinlane / 32; + const int secondwordinlane = (bitsinlane + bit - 1) / 32; + const uint32_t firstpart = + pin[4 * firstwordinlane + lane] >> (bitsinlane % 32); + const uint32_t mask = (1U << bit) - 1; + if (firstwordinlane == secondwordinlane) { + /* easy common case*/ + return initvalue + (firstpart & mask); + } else { + /* harder case where we need to combine two words */ + const uint32_t secondpart = pin[4 * firstwordinlane + 4 + lane]; + const int usablebitsinfirstword = 32 - (bitsinlane % 32); + return initvalue + + ((firstpart | (secondpart << usablebitsinfirstword)) & mask); + } + } +} + +int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, + int length, uint32_t key, uint32_t *presult) { + int count = length; + int begin = 0; + uint32_t val; + while (count > 0) { + int step = count / 2; + val = simdselectFOR(initvalue, in, bit, begin + step); + if (val < key) { + begin += step + 1; + count -= step + 1; + } else + count = step; + } + *presult = simdselectFOR(initvalue, in, bit, begin); + return begin; +} + +int simdpackFOR_compressedbytes(int length, const uint32_t bit) { + if (bit == 0) + return 0; /* nothing to do */ + if (bit == 32) { + return length * sizeof(uint32_t); + } + return (((length + 3) / 4) * bit + 31) / 32 * sizeof(__m128i); +} + +__m128i *simdpackFOR_length(uint32_t initvalue, const uint32_t *in, int length, + __m128i *out, const uint32_t bit) { + int k; + int inwordpointer; + __m128i P; + uint32_t firstpass; + __m128i offset; + if (bit == 0) + return out; /* nothing to do */ + if (bit == 32) { + memcpy(out, in, length * sizeof(uint32_t)); + return (__m128i *)((uint32_t *)out + length); + } + offset = _mm_set1_epi32(initvalue); + inwordpointer = 0; + P = _mm_setzero_si128(); + for (k = 0; k < length / 4; ++k) { + __m128i value = + _mm_sub_epi32(_mm_loadu_si128(((const __m128i *)in + k)), offset); + P = _mm_or_si128(P, _mm_slli_epi32(value, inwordpointer)); + firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + _mm_storeu_si128(out++, P); + P = _mm_srli_epi32(value, firstpass); + inwordpointer = bit - firstpass; + } + } + if (length % 4 != 0) { + uint32_t buffer[4]; + __m128i value; + for (k = 0; k < (length % 4); ++k) { + buffer[k] = in[length / 4 * 4 + k]; + } + for (k = (length % 4); k < 4; ++k) { + buffer[k] = initvalue; + } + value = _mm_sub_epi32(_mm_loadu_si128((__m128i *)buffer), offset); + P = _mm_or_si128(P, _mm_slli_epi32(value, inwordpointer)); + firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + _mm_storeu_si128(out++, P); + P = _mm_srli_epi32(value, firstpass); + inwordpointer = bit - firstpass; + } + } + if (inwordpointer != 0) { + _mm_storeu_si128(out++, P); + } + return out; +} + +const __m128i *simdunpackFOR_length(uint32_t initvalue, const __m128i *in, + int length, uint32_t *out, + const uint32_t bit) { + int k; + __m128i maskbits; + int inwordpointer; + __m128i P; + __m128i offset; + if (length == 0) + return in; + if (bit == 0) { + for (k = 0; k < length; ++k) { + out[k] = initvalue; + } + return in; + } + if (bit == 32) { + memcpy(out, in, length * sizeof(uint32_t)); + return (const __m128i *)((const uint32_t *)in + length); + } + offset = _mm_set1_epi32(initvalue); + maskbits = _mm_set1_epi32((1U << bit) - 1); + 
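+  /* Streaming decode: P holds the current packed word of each lane and
+     inwordpointer counts how many of its bits are already consumed; each step
+     shifts the next bit-wide field into place, splices in the following packed
+     word whenever the field straddles a 32-bit boundary, masks it with
+     maskbits and adds back the frame-of-reference offset. */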
inwordpointer = 0; + P = _mm_loadu_si128((__m128i *)in); + ++in; + if (length % 4 == 0) { + + for (k = 0; k + 1 < length / 4; ++k) { + __m128i answer = _mm_srli_epi32(P, inwordpointer); + const uint32_t firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + P = _mm_loadu_si128((__m128i *)in); + ++in; + answer = _mm_or_si128(_mm_slli_epi32(P, firstpass), answer); + inwordpointer = bit - firstpass; + } + answer = _mm_and_si128(maskbits, answer); + _mm_storeu_si128((__m128i *)out, _mm_add_epi32(answer, offset)); + out += 4; + } + if (k < length / 4) { + __m128i answer = _mm_srli_epi32(P, inwordpointer); + const uint32_t firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else if (bit == firstpass) { + inwordpointer = 0; + } else { + P = _mm_loadu_si128((__m128i *)in); + ++in; + answer = _mm_or_si128(_mm_slli_epi32(P, firstpass), answer); + inwordpointer = bit - firstpass; + } + answer = _mm_and_si128(maskbits, answer); + _mm_storeu_si128((__m128i *)out, _mm_add_epi32(answer, offset)); + out += 4; + } + + } else { + for (k = 0; k < length / 4; ++k) { + __m128i answer = _mm_srli_epi32(P, inwordpointer); + const uint32_t firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + P = _mm_loadu_si128((__m128i *)in); + ++in; + answer = _mm_or_si128(_mm_slli_epi32(P, firstpass), answer); + inwordpointer = bit - firstpass; + } + answer = _mm_and_si128(maskbits, answer); + _mm_storeu_si128((__m128i *)out, _mm_add_epi32(answer, offset)); + out += 4; + } + uint32_t buffer[4]; + __m128i answer = _mm_srli_epi32(P, inwordpointer); + const uint32_t firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else if (bit == firstpass) { + inwordpointer = 0; + } else { + P = _mm_loadu_si128((__m128i *)in); + ++in; + answer = _mm_or_si128(_mm_slli_epi32(P, firstpass), answer); + inwordpointer = bit - firstpass; + } + answer = _mm_and_si128(maskbits, answer); + _mm_storeu_si128((__m128i *)buffer, _mm_add_epi32(answer, offset)); + for (k = 0; k < (length % 4); ++k) { + *out = buffer[k]; + ++out; + } + } + return in; +} + +void simdfastsetFOR(uint32_t initvalue, __m128i *in, uint32_t bit, + uint32_t value, size_t index) { + simdfastset(in, bit, value - initvalue, index); +} diff --git a/src/simdintegratedbitpacking.c b/src/simdintegratedbitpacking.c index ffe6927..4d43936 100644 --- a/src/simdintegratedbitpacking.c +++ b/src/simdintegratedbitpacking.c @@ -3,24860 +3,25355 @@ */ #include "simdintegratedbitpacking.h" -__attribute__((always_inline)) -static inline __m128i Delta(__m128i curr, __m128i prev) { - return _mm_sub_epi32(curr, - _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); +#if defined(__SSSE3__) || defined(__AVX__) +#define Delta(curr, prev) _mm_sub_epi32(curr, _mm_alignr_epi8(curr, prev, 12)) +#else +#define Delta(curr, prev) \ + _mm_sub_epi32( \ + curr, _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))) +#endif + +#define PrefixSum(ret, curr, prev) \ + do { \ + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); \ + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); \ + ret = _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); \ + } while (0) + +__m128i iunpack0(__m128i initOffset, const __m128i *_in, uint32_t *_out) { + __m128i *out = (__m128i *)(_out); + const __m128i constant = _mm_shuffle_epi32(initOffset, 0xff); + uint32_t i; + 
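+  /* A bit width of 0 means every delta is zero, so all 128 outputs equal the
+     last value of the previous block: lane 3 of initOffset, broadcast above by
+     _mm_shuffle_epi32(initOffset, 0xff). */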
(void)_in; + + for (i = 0; i < 8; ++i) { + _mm_storeu_si128(out++, constant); + _mm_storeu_si128(out++, constant); + _mm_storeu_si128(out++, constant); + _mm_storeu_si128(out++, constant); + } + + return initOffset; } -__attribute__((always_inline)) -static inline __m128i PrefixSum(__m128i curr, __m128i prev) { - const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); - const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); - return _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); +void ipackwithoutmask0(__m128i initOffset, const uint32_t *_in, __m128i *out) { + (void)initOffset; + (void)_in; + (void)out; } - -__m128i iunpack0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { - __m128i *out = (__m128i*)(_out); - const __m128i zero = _mm_set1_epi32 (0); - - for (unsigned i = 0; i < 8; ++i) { - initOffset = PrefixSum(zero, initOffset); - _mm_storeu_si128(out++, initOffset); - initOffset = PrefixSum(zero, initOffset); - _mm_storeu_si128(out++, initOffset); - initOffset = PrefixSum(zero, initOffset); - _mm_storeu_si128(out++, initOffset); - initOffset = PrefixSum(zero, initOffset); - _mm_storeu_si128(out++, initOffset); - } - - return initOffset; +void ipack0(__m128i initOffset, const uint32_t *_in, __m128i *out) { + (void)initOffset; + (void)_in; + (void)out; } - - - -void ipackwithoutmask0(__m128i initOffset , const uint32_t * _in , __m128i * out) { - +void ipackwithoutmask1(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); } - -void ipack0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { +void ipack1(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(1U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
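+  /* Delta() gives, per 32-bit lane, the difference between each element and
+     the one preceding it (lane 0's predecessor being the last lane of the
+     previous vector); the mask keeps only the low bit of each difference, and
+     initOffset carries the previous vector forward as the running reference. */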
initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg 
= _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); } - - -void ipackwithoutmask1(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask2(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack1(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; 
- - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); 
- InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - +void ipack2(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(3U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + 
_mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask2(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - 
++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask3(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack2(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(3U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - +void ipack3(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(7U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask3(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask4(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack3(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(7U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - +void ipack4(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(15U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, 
OutReg); } - - - -void ipackwithoutmask4(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask5(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; 
+ CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack4(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(15U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - +void ipack5(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(31U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask5(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); 
- InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask6(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + 
++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack5(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(31U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - +void ipack6(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(63U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset 
= CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask6(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask7(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + 
__m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); 
+ ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack6(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(63U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - +void ipack7(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(127U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask7(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask8(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; 
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack7(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(127U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 
7 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - +void ipack8(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(255U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask8(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = 
_mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask9(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); 
+ InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack8(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(255U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - +void ipack9(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(511U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask9(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask10(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack9(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(511U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - +void ipack10(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(1023U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask10(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - 
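/* A minimal usage sketch for the integrated (delta) packers introduced above, assuming a
   caller compresses one block of 128 sorted values whose successive deltas fit in 10 bits.
   `pack_block_10` and `prev` are hypothetical names for illustration only; the call matches
   the signature shown in this patch, void ipack10(__m128i initOffset, const uint32_t *_in, __m128i *out). */
#include <stdint.h>
#include <emmintrin.h>

void pack_block_10(uint32_t prev, const uint32_t in[128], __m128i out[10]) {
    /* 128 values * 10 bits = 1280 bits = ten 128-bit output words */
    __m128i init = _mm_set1_epi32((int)prev);   /* last value of the previous block */
    ipack10(init, in, out);                     /* masked variant: each delta is truncated to 10 bits */
    /* ipackwithoutmask10(init, in, out) performs the same packing but skips the masking,
       so it assumes every delta already fits in 10 bits. */
}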
CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask11(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack10(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1023U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - +void ipack11(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(2047U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg 
= _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask11(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask12(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg 
= Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack11(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(2047U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - +void ipack12(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(4095U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); 
+ initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask12(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
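/* The unrolled SSE routines in this file all follow one recipe: OR each b-bit delta into the
   current output word at the running bit offset, and when a value straddles a 32-bit boundary,
   store the word and carry the value's high bits into the next word with a right shift by
   (b - new_offset). A simplified scalar sketch of that recipe follows; `scalar_pack` is a
   hypothetical helper, not one of the generated routines, and it ignores that the real code
   runs four interleaved streams, one per 32-bit lane of an __m128i. */
#include <stdint.h>

static void scalar_pack(const uint32_t *deltas, int n, int b, uint32_t *out) {
    uint32_t acc = 0;   /* output word currently being filled */
    int shift = 0;      /* bit offset within acc */
    for (int i = 0; i < n; i++) {
        uint32_t v = (b < 32) ? (deltas[i] & ((1u << b) - 1)) : deltas[i];
        acc |= v << shift;
        shift += b;
        if (shift >= 32) {
            *out++ = acc;
            shift -= 32;
            /* carry: the high `shift` bits of v start the next word, i.e. v >> (b - shift),
               matching the generated `OutReg = _mm_srli_epi32(InReg, b - shift)` step */
            acc = (shift > 0) ? (v >> (b - shift)) : 0;
        }
    }
    if (shift > 0) *out++ = acc;
}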
_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask13(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); 
+ _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack12(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i 
mask = _mm_set1_epi32(4095U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - +void ipack13(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(8191U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); 
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask13(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset 
= CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask14(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack13(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(8191U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - +void ipack14(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(16383U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask14(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask15(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack14(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(16383U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - 
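/*
 * Orientation for the regenerated routine above: ipackwithoutmask15 consumes 32
 * SSE registers (128 consecutive uint32_t values), turns each register into a
 * register of differences via Delta(CurrIn, initOffset), and ORs those 15-bit
 * deltas into the output stream at increasing bit offsets. Whenever a delta
 * straddles a 32-bit lane boundary, its low bits complete the current output
 * word and the remaining high bits are recovered with
 * _mm_srli_epi32(InReg, 15 - k) to begin the next word. A minimal scalar sketch
 * of that carry logic for an assumed bit width b (names are illustrative and
 * not part of the patch):
 */
static void scalar_pack_sketch(const uint32_t *deltas, uint32_t *out, int b) {
    uint32_t word = 0; /* output word currently being filled */
    int used = 0;      /* bits of `word` already occupied    */
    int i;
    for (i = 0; i < 128; ++i) {
        word |= deltas[i] << used;
        used += b;
        if (used >= 32) { /* the value straddled the word boundary */
            *out++ = word;
            used -= 32;
            word = used ? (deltas[i] >> (b - used)) : 0; /* carry the high bits */
        }
    }
    if (used) {
        *out++ = word;
    }
}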
CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - +void ipack15(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(32767U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask15(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = 
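/*
 * ipack15 differs from ipackwithoutmask15 only in that every delta is ANDed
 * with a mask before being packed, so inputs whose deltas need more than 15
 * bits cannot spill into neighbouring values; the without-mask variant trusts
 * the caller to have checked the bit width already. The mask constants used
 * throughout these routines are simply (1 << b) - 1; a small check of the ones
 * visible in this hunk (illustrative, not part of the patch):
 */
static int mask_constant_sketch(void) {
    return (16383U  == (1U << 14) - 1)   /* ipack14 */
        && (32767U  == (1U << 15) - 1)   /* ipack15 */
        && (65535U  == (1U << 16) - 1)   /* ipack16 */
        && (131071U == (1U << 17) - 1);  /* ipack17 */
}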
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask16(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack15(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(32767U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - 
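/*
 * The 16-bit case above is the degenerate one: 16 divides 32 exactly, so no
 * delta ever straddles an output word and the routine is sixteen repetitions
 * of "take a delta, OR in the next register's delta shifted left by 16,
 * store"; there are no _mm_srli_epi32 carry steps at all. The per-lane scalar
 * analogue is sketched below (the real routine interleaves values across the
 * four SSE lanes, so this shows the idea, not the exact memory layout):
 */
static void pack16_pair_sketch(const uint32_t *deltas, uint32_t *out) {
    int i;
    for (i = 0; i < 128; i += 2) {
        out[i / 2] = deltas[i] | (deltas[i + 1] << 16);
    }
}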
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - +void ipack16(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(65535U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; 
+ CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask16(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
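/*
 * Shape of a call site (a hypothetical driver, not taken from this patch):
 * each generated routine packs one block of 128 uint32_t values into b output
 * __m128i words, where b is the bit width in the function name (16 registers,
 * i.e. 256 bytes, for ipack16). initOffset is the register against which the
 * first four values are differenced, so a caller chaining blocks would
 * presumably seed it with zero for the first block and with the last four
 * inputs of the previous block afterwards.
 */
static void pack_block16_sketch(const uint32_t *block, __m128i *packed,
                                __m128i *running_offset) {
    ipack16(*running_offset, block, packed); /* 128 values -> 16 packed registers */
    *running_offset = _mm_loadu_si128((const __m128i *)(block + 124)); /* last 4 inputs */
}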
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask17(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); 
+ initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack16(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(65535U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - 
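/*
 * For widths above 16, every other value crosses a word boundary, which is why
 * the 17-bit routine above alternates plain shifts (17, 2, 19, 4, 21, ...)
 * with carry steps of the form _mm_srli_epi32(InReg, 17 - k). The shift
 * applied to the j-th value is simply (17 * j) mod 32, and k in the carry step
 * is the bit position where that value resumes in the next word. A one-liner
 * for the start-bit arithmetic (illustrative only):
 */
static unsigned start_bit_of(unsigned j, unsigned b) {
    return (j * b) % 32u; /* b = 17 gives 0, 17, 2, 19, 4, 21, ... */
}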
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - +void ipack17(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(131071U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); } - - - -void ipackwithoutmask17(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - +void ipackwithoutmask18(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); } - - - -void ipack17(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(131071U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - +void ipack18(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(262143U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); } +void ipackwithoutmask19(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); +} +void ipack19(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(524287U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); +} +void ipackwithoutmask20(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); +} -void ipackwithoutmask18(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); +void ipack20(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(1048575U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn 
= _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); +} +void ipackwithoutmask21(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); +} +void ipack21(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(2097151U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset 
= CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); } +void ipackwithoutmask22(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); +} +void ipack22(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(4194303U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); +} +void ipackwithoutmask23(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); 
+ InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); +} -void ipack18(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(262143U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); +void ipack23(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(8388607U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 
- 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); +} +void ipackwithoutmask24(__m128i 
initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); +} +void ipack24(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(16777215U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; 
+ + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); } +void ipackwithoutmask25(__m128i initOffset, const uint32_t 
*_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); +} +void ipack25(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(33554431U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); +} +void ipackwithoutmask26(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); +} -void ipackwithoutmask19(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 19 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); +void ipack26(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(67108863U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); +} +void ipackwithoutmask27(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); +} +void ipack27(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(134217727U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); } +void ipackwithoutmask28(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); +} +void ipack28(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(268435455U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; 
+ + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); +} +void ipackwithoutmask29(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); +} -void ipack19(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(524287U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 19 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - ++in; - CurrIn 
= _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); +void ipack29(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(536870911U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); +} +void ipackwithoutmask30(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 
10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 
12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); +} +void ipack30(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(1073741823U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); } +void ipackwithoutmask31(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); +} +void ipack31(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, CurrIn, InReg; + + const __m128i mask = _mm_set1_epi32(2147483647U); + ; + + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); +} +void ipackwithoutmask32(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, InReg; + (void)initOffset; -void ipackwithoutmask20(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack20(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1048575U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - 
OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipackwithoutmask21(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack21(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(2097151U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); 
- initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = 
_mm_loadu_si128(in); -void ipackwithoutmask22(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); + OutReg = 
InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack22(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(4194303U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; 
- CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipackwithoutmask23(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 23 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 23 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack23(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(8388607U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipackwithoutmask24(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack24(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(16777215U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipackwithoutmask25(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - 
__m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - 
++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); } +void ipack32(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg, InReg; + (void)initOffset; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack25(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(33554431U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipackwithoutmask26(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack26(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(67108863U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + 
++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipackwithoutmask27(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); 
- initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack27(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(134217727U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 27 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipackwithoutmask28(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); 
- _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack28(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(268435455U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipackwithoutmask29(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack29(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(536870911U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipackwithoutmask30(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, 
initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn 
= _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -void ipack30(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1073741823U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn 
= _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); } +__m128i iunpack1(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 1) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + 
PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); 
+ + tmp = _mm_srli_epi32(InReg, 27); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack2(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 2) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = 
_mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack3(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 3) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + 
OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} -void ipackwithoutmask31(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - _mm_storeu_si128(out, OutReg); +__m128i iunpack4(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 4) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, 
mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack5(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 5) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = 
_mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack6(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 6) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + 
InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; } +__m128i iunpack7(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 7) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + 
PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, 
OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack8(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 8) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + 
OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack9(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 9) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + 
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 
5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} -void ipack31(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(2147483647U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - _mm_storeu_si128(out, OutReg); +__m128i iunpack10(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 10) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + 
PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, 
OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack11(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 11) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = 
OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + 
OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack12(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 12) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + 
+ PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, 
OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; } +__m128i iunpack13(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 13) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack14(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 14) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + 
PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = 
tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack15(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 15) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = 
_mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} -void ipackwithoutmask32(__m128i initOffset , const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; +__m128i iunpack16(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 16) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp 
= InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +__m128i iunpack17(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 17) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); 
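+ /* Each iunpackN step follows the same pattern: extract one 4-lane group of
+    N-bit deltas from the packed words, then PrefixSum(OutReg, OutReg,
+    initOffset) turns the deltas into running absolute values seeded from
+    initOffset, which is refreshed with the newest outputs so the running
+    total carries into the next group (in scalar terms, out[i] = out[i-1] +
+    delta[i]). At this point a 17-bit value straddles two 32-bit words: the
+    shift by 17 above kept its low 15 bits from the previous word, and the
+    freshly loaded word supplies the top 2 bits, shifted left by 17 - 2 = 15
+    and masked back to 17 bits below. */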
+ OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + + 
PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - __m128i InReg = _mm_loadu_si128(in); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); +__m128i 
iunpack18(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 18) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), 
mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - ++out; - ++in; - InReg = _mm_loadu_si128(in); +__m128i iunpack19(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 19) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, 
OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); +__m128i iunpack20(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 20) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); 
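+ /* Unlike the straddling case, a 20-bit value that starts at bit 8 ends at
+    bit 27 and fits entirely inside the current word, so masking the shifted
+    word to 20 bits recovers the delta without loading the next input word. */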
+ OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - ++out; - ++in; - InReg = _mm_loadu_si128(in); +__m128i iunpack21(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 21) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); +__m128i iunpack22(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 22) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - ++out; - ++in; - InReg = _mm_loadu_si128(in); +__m128i iunpack23(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 23) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); +__m128i iunpack24(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 24) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + 
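+ /* PrefixSum turns the four unpacked deltas into absolute values by adding them to the running offset carried in initOffset; the resulting vector then becomes the offset for the next four values */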
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - ++out; - ++in; - InReg = _mm_loadu_si128(in); +__m128i iunpack25(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 25) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + 
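+ /* unaligned intrinsics (_mm_loadu_si128 / _mm_storeu_si128) are used throughout, so neither the packed input nor the 128-integer output block needs 16-byte alignment */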
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); +__m128i iunpack26(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 26) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + 
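+ /* last store of the block: 32 stores of 4 integers each reconstruct 128 values, and the updated offset is returned so the caller can chain decoding into the next block */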
_mm_storeu_si128(out++, OutReg); + + return initOffset; +} - ++out; - ++in; - InReg = _mm_loadu_si128(in); +__m128i iunpack27(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 27) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + 
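+ /* this 27-bit field ends within the current 32-bit word, so a right shift followed by the mask is enough; no bits from the next word are needed */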
PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); +__m128i iunpack28(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 28) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + 
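+ /* this value straddles a word boundary: its low bits were taken from the previous word (shifted right), the remaining high bits come from the freshly loaded word, shifted left into position, masked to 28 bits and OR-ed in */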
PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - ++out; - ++in; - InReg = _mm_loadu_si128(in); +__m128i iunpack29(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 29) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + 
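+ /* mask = (1U << 29) - 1 keeps exactly 29 bits per lane; shift amounts such as 29 - 26 above place the spill-over bits from the next word at the correct position before masking */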
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); +__m128i iunpack30(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 30) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg 
= + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + 
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - ++out; - ++in; - InReg = _mm_loadu_si128(in); +__m128i iunpack31(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 31) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = tmp; + PrefixSum(OutReg, OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); +__m128i iunpack32(__m128i initOffset, const __m128i *in, uint32_t *_out) { + __m128i *mout = (__m128i *)(_out); + __m128i invec; + size_t k; + (void)initOffset; + for (k = 0; k < 128 / 4; ++k) { + invec = _mm_loadu_si128(in++); + _mm_storeu_si128(mout++, invec); + } + return invec; +} - ++out; - ++in; - InReg = _mm_loadu_si128(in); +void simdunpackd1(uint32_t initvalue, const __m128i *in, uint32_t *out, + const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32(initvalue); + switch (bit) { + case 0: + iunpack0(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 1: + iunpack1(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 2: + iunpack2(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 3: + iunpack3(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 4: + iunpack4(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 5: + iunpack5(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 6: + iunpack6(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 7: + iunpack7(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 8: + iunpack8(initOffset, in, out); + break; + + case 9: + iunpack9(initOffset, in, out); + break; + + case 10: + iunpack10(initOffset, in, out); + break; + + case 11: + iunpack11(initOffset, in, out); + break; + + case 12: + iunpack12(initOffset, in, out); + break; + + case 13: + iunpack13(initOffset, in, out); + break; + + case 14: + iunpack14(initOffset, in, out); + break; + + case 15: + iunpack15(initOffset, in, out); + break; + + case 16: + iunpack16(initOffset, in, out); + break; + + case 17: + iunpack17(initOffset, in, out); + break; + + case 18: + iunpack18(initOffset, in, out); + break; + + case 19: + iunpack19(initOffset, in, out); + break; + + case 20: + iunpack20(initOffset, in, out); + break; + + case 21: + iunpack21(initOffset, in, out); + break; + + case 22: + iunpack22(initOffset, in, out); + break; + + case 23: + iunpack23(initOffset, in, out); + break; + + case 24: + iunpack24(initOffset, in, out); + break; + + case 25: + iunpack25(initOffset, in, out); + break; - OutReg = InReg; - 
_mm_storeu_si128(out, OutReg); + case 26: + iunpack26(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 27: + iunpack27(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 28: + iunpack28(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 29: + iunpack29(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 30: + iunpack30(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 31: + iunpack31(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 32: + iunpack32(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + default: + break; + } +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); +/*assumes that integers fit in the prescribed number of bits*/ - ++out; - ++in; - InReg = _mm_loadu_si128(in); +void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t *in, __m128i *out, + const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32(initvalue); + switch (bit) { + case 0: + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 1: + ipackwithoutmask1(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 2: + ipackwithoutmask2(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 3: + ipackwithoutmask3(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 4: + ipackwithoutmask4(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 5: + ipackwithoutmask5(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 6: + ipackwithoutmask6(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 7: + ipackwithoutmask7(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 8: + ipackwithoutmask8(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 9: + ipackwithoutmask9(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 10: + ipackwithoutmask10(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 11: + ipackwithoutmask11(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 12: + ipackwithoutmask12(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 13: + ipackwithoutmask13(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 14: + ipackwithoutmask14(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 15: + ipackwithoutmask15(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 16: + ipackwithoutmask16(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 17: + ipackwithoutmask17(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 18: + ipackwithoutmask18(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 19: + ipackwithoutmask19(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 20: + ipackwithoutmask20(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 21: + ipackwithoutmask21(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 22: + 
ipackwithoutmask22(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 23: + ipackwithoutmask23(initOffset, in, out); + break; + + case 24: + ipackwithoutmask24(initOffset, in, out); + break; + + case 25: + ipackwithoutmask25(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 26: + ipackwithoutmask26(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 27: + ipackwithoutmask27(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 28: + ipackwithoutmask28(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 29: + ipackwithoutmask29(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 30: + ipackwithoutmask30(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 31: + ipackwithoutmask31(initOffset, in, out); + break; + case 32: + ipackwithoutmask32(initOffset, in, out); + break; + default: + break; + } } +void simdpackd1(uint32_t initvalue, const uint32_t *in, __m128i *out, + const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32(initvalue); + switch (bit) { + case 0: + break; + ; + case 1: + ipack1(initOffset, in, out); + break; + case 2: + ipack2(initOffset, in, out); + break; -void ipack32(__m128i initOffset , const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; + case 3: + ipack3(initOffset, in, out); + break; + case 4: + ipack4(initOffset, in, out); + break; + case 5: + ipack5(initOffset, in, out); + break; - __m128i InReg = _mm_loadu_si128(in); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 6: + ipack6(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 7: + ipack7(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 8: + ipack8(initOffset, in, out); + break; + + case 9: + ipack9(initOffset, in, out); + break; + + case 10: + ipack10(initOffset, in, out); + break; + + case 11: + ipack11(initOffset, in, out); + break; + + case 12: + ipack12(initOffset, in, out); + break; + + case 13: + ipack13(initOffset, in, out); + break; + + case 14: + ipack14(initOffset, in, out); + break; + + case 15: + ipack15(initOffset, in, out); + break; + + case 16: + ipack16(initOffset, in, out); + break; + + case 17: + ipack17(initOffset, in, out); + break; + + case 18: + ipack18(initOffset, in, out); + break; + + case 19: + ipack19(initOffset, in, out); + break; + + case 20: + ipack20(initOffset, in, out); + break; + + case 21: + ipack21(initOffset, in, out); + break; + + case 22: + ipack22(initOffset, in, out); + break; + + case 23: + ipack23(initOffset, in, out); + break; + + case 24: + ipack24(initOffset, in, out); + break; + + case 25: + ipack25(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 26: + ipack26(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 27: + ipack27(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 28: + ipack28(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 29: + ipack29(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 30: + ipack30(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + case 31: + ipack31(initOffset, in, out); + break; - ++out; - ++in; - InReg = _mm_loadu_si128(in); + case 32: + 
ipack32(initOffset, in, out); + break; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); + default: + break; + } +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - -} - - - - - -__m128i iunpack1(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<1)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
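Editorial aside (not part of the patch): the three dispatch routines introduced above, simdpackd1, simdpackwithoutmaskd1 and simdunpackd1, are the public entry points of the differential ("d1") codec, and the remainder of this hunk only deletes the superseded generated code they replace. The sketch below is a minimal, hedged illustration of how one block might be round-tripped through them; it assumes the library's umbrella header simdcomp.h, the simdmaxbitsd1() helper from simdcomputil.h, a block size of 128 integers, and that initvalue is the value preceding the block (0 for the first block). The helper name d1_roundtrip is illustrative only.

#include "simdcomp.h"

/* Illustrative sketch: pack one sorted block of 128 values and unpack it
   again. `init` is the value that precedes the block (0 for the first block). */
static void d1_roundtrip(const uint32_t sorted[128], uint32_t recovered[128]) {
    __m128i packed[32];   /* worst case: 32 bits per value => 32 x 128-bit words */
    uint32_t init = 0;    /* value preceding this block */
    /* smallest bit width such that every successive difference fits */
    uint32_t bit = simdmaxbitsd1(init, sorted);

    /* the differences are known to fit in `bit` bits, so the no-mask
       variant is safe here (cf. the precondition comment in the patch) */
    simdpackwithoutmaskd1(init, sorted, packed, bit);

    /* decode; after this, recovered[i] should equal sorted[i] */
    simdunpackd1(init, packed, recovered, bit);
}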
_mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack2(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<2)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack3(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<3)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - 
tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack4(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<4)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack5(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<5)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack6(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<6)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack7(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<7)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = 
OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - 
- tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack8(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<8)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack9(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<9)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset 
= OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack10(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<10)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack11(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<11)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg 
= PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); - - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack12(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<12)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; 
InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - 
tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack13(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<13)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
13-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, 
mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack14(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<14)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack15(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<15)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset 
= OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); - - OutReg = PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack16(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<16)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); 
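/*
 * Reader's note (not part of the generated diff above): every iunpackN
 * routine in this file decodes 128 packed N-bit deltas from __m128i words
 * and immediately integrates them with PrefixSum, so the stored output is
 * the running values rather than the raw deltas.  The helper below is only
 * a hedged scalar reference of that per-lane computation -- hypothetical
 * name, assumes <stdint.h> -- included to document the pattern the unrolled
 * SSE code repeats for each bit width.
 */
static void scalar_iunpack_ref(uint32_t base, const uint32_t *in,
                               uint32_t *out, unsigned bit, unsigned count)
{
    uint32_t mask = (bit < 32) ? ((1U << bit) - 1U) : 0xFFFFFFFFU;
    uint32_t running = base;            /* plays the role of initOffset     */
    unsigned i, bitpos = 0;             /* total bits consumed so far       */
    for (i = 0; i < count; ++i) {
        unsigned word = bitpos >> 5;    /* which 32-bit word we are in      */
        unsigned off = bitpos & 31;     /* bit offset inside that word      */
        uint32_t delta = in[word] >> off;
        if (off + bit > 32)             /* value straddles a word boundary  */
            delta |= in[word + 1] << (32 - off);
        delta &= mask;
        running += delta;               /* "integrated": emit prefix sums   */
        out[i] = running;
        bitpos += bit;
    }
}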
- OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack17(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<17)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack18(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<18)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack19(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<19)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), 
mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack20(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<20)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; 
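/*
 * Reader's note (not part of the diff): whenever the bit width does not
 * divide 32, a packed value straddles two consecutive words.  The generated
 * code handles this by OR-ing in the high bits shifted up from the next
 * word, e.g. for 20-bit values: srli(InReg, 20) keeps the low 12 bits left
 * in the current word and slli(next, 20-8) supplies the remaining 8 bits.
 * A tiny scalar analogue (hypothetical name, assumes <stdint.h>):
 */
static uint32_t stitch20_example(uint32_t w0, uint32_t w1)
{
    uint32_t mask = (1U << 20) - 1U;
    /* low 12 bits sit at bit 20 of w0; the top 8 bits are the low bits of
       w1, moved up by 12 = 20 - 8 before masking to 20 bits */
    return ((w0 >> 20) | (w1 << 12)) & mask;
}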
- ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack21(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<21)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); - - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack22(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<22)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); 
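/*
 * Reader's note (not part of the diff): PrefixSum(curr, prev), as used in
 * every step above, turns the four per-lane deltas in curr into four running
 * sums that continue from the value carried in prev, and each routine
 * returns its final output vector so the caller can feed it back in as
 * initOffset for the next 128-integer block.  PrefixSum itself is defined
 * elsewhere in this file; the routine below is only a hedged SSE2 sketch of
 * that idea (hypothetical name, assumes <emmintrin.h>), not the library's
 * actual definition.
 */
static __m128i prefix_sum_sketch(__m128i curr, __m128i prev)
{
    __m128i carry;
    curr = _mm_add_epi32(curr, _mm_slli_si128(curr, 4));  /* a, a+b, b+c, c+d */
    curr = _mm_add_epi32(curr, _mm_slli_si128(curr, 8));  /* inclusive scan   */
    carry = _mm_shuffle_epi32(prev, 0xFF);                /* broadcast lane 3 */
    return _mm_add_epi32(curr, carry);                    /* add carried sum  */
}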
- - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; 
- ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return 
initOffset; - -} - - - - - -__m128i iunpack23(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<23)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack24(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<24)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), 
mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack25(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<25)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack26(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<26)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); 
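/* Reading of the generated pattern (not stated in the patch itself): each
   output value is rebuilt by shifting down the bits that remain in the
   current 128-bit word, OR-ing in any bits that spilled into the next word
   (shifted up by the bit width minus the spill size), masking to the bit
   width, and then applying the 4-lane PrefixSum against the previous
   outputs before the store. */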
- - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack27(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - 
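/* Assumed contract, inferred from the surrounding functions: iunpack27 reads
   27 consecutive __m128i words (128 values at 27 bits each), writes 32
   __m128i (128 uint32_t) of delta-decoded output to _out, and returns the
   updated running offset.  A hypothetical caller (names illustrative only):

       __m128i prev = _mm_set1_epi32(initvalue);
       prev = iunpack27(prev, (const __m128i *)compressed, recovered);
       // recovered[] now holds 128 decoded values; prev carries the last
       // four of them, ready for the next block.
*/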
__m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<27)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); - - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack28(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<28)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
28-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack29(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<29)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - 
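/* At this point (assumed reading): a 29-bit value straddles the word
   boundary, so the 12 bits left in the current word are shifted down, the
   next word is loaded, and its low 17 bits are shifted up by 29-17 = 12 and
   OR-ed in before masking and the usual PrefixSum/store. */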
OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); - - OutReg = PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack30(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<30)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - 
OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack31(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<31)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -__m128i iunpack32(__m128i initOffset, const 
__m128i* in, uint32_t * _out) { - __m128i * mout = (__m128i *)(_out); - __m128i invec; - for(size_t k = 0; k < 128/4; ++k) { - invec = _mm_loadu_si128(in++); - _mm_storeu_si128(mout++, invec); - } - return invec; -} - - - - - void simdunpackd1(uint32_t initvalue, const __m128i * in, uint32_t * out, const uint32_t bit) { - __m128i initOffset = _mm_set1_epi32 (initvalue); - switch(bit) { - case 0: iunpack0(initOffset,in,out); break; - - case 1: iunpack1(initOffset,in,out); break; - - case 2: iunpack2(initOffset,in,out); break; - - case 3: iunpack3(initOffset,in,out); break; - - case 4: iunpack4(initOffset,in,out); break; - - case 5: iunpack5(initOffset,in,out); break; - - case 6: iunpack6(initOffset,in,out); break; - - case 7: iunpack7(initOffset,in,out); break; - - case 8: iunpack8(initOffset,in,out); break; - - case 9: iunpack9(initOffset,in,out); break; - - case 10: iunpack10(initOffset,in,out); break; - - case 11: iunpack11(initOffset,in,out); break; - - case 12: iunpack12(initOffset,in,out); break; - - case 13: iunpack13(initOffset,in,out); break; - - case 14: iunpack14(initOffset,in,out); break; - - case 15: iunpack15(initOffset,in,out); break; - - case 16: iunpack16(initOffset,in,out); break; - - case 17: iunpack17(initOffset,in,out); break; - - case 18: iunpack18(initOffset,in,out); break; - - case 19: iunpack19(initOffset,in,out); break; - - case 20: iunpack20(initOffset,in,out); break; - - case 21: iunpack21(initOffset,in,out); break; - - case 22: iunpack22(initOffset,in,out); break; - - case 23: iunpack23(initOffset,in,out); break; - - case 24: iunpack24(initOffset,in,out); break; - - case 25: iunpack25(initOffset,in,out); break; - - case 26: iunpack26(initOffset,in,out); break; - - case 27: iunpack27(initOffset,in,out); break; - - case 28: iunpack28(initOffset,in,out); break; - - case 29: iunpack29(initOffset,in,out); break; - - case 30: iunpack30(initOffset,in,out); break; - - case 31: iunpack31(initOffset,in,out); break; - - case 32: iunpack32(initOffset,in,out); break; - - default: break; - } -} - - - - /*assumes that integers fit in the prescribed number of bits*/ - -void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { - __m128i initOffset = _mm_set1_epi32 (initvalue); - switch(bit) { - case 0: break; - - case 1: ipackwithoutmask1(initOffset,in,out); break; - - case 2: ipackwithoutmask2(initOffset,in,out); break; - - case 3: ipackwithoutmask3(initOffset,in,out); break; - - case 4: ipackwithoutmask4(initOffset,in,out); break; - - case 5: ipackwithoutmask5(initOffset,in,out); break; - - case 6: ipackwithoutmask6(initOffset,in,out); break; - - case 7: ipackwithoutmask7(initOffset,in,out); break; - - case 8: ipackwithoutmask8(initOffset,in,out); break; - - case 9: ipackwithoutmask9(initOffset,in,out); break; - - case 10: ipackwithoutmask10(initOffset,in,out); break; - - case 11: ipackwithoutmask11(initOffset,in,out); break; - - case 12: ipackwithoutmask12(initOffset,in,out); break; - - case 13: ipackwithoutmask13(initOffset,in,out); break; - - case 14: ipackwithoutmask14(initOffset,in,out); break; - - case 15: ipackwithoutmask15(initOffset,in,out); break; - - case 16: ipackwithoutmask16(initOffset,in,out); break; - - case 17: ipackwithoutmask17(initOffset,in,out); break; - - case 18: ipackwithoutmask18(initOffset,in,out); break; - - case 19: ipackwithoutmask19(initOffset,in,out); break; - - case 20: ipackwithoutmask20(initOffset,in,out); break; - - case 21: ipackwithoutmask21(initOffset,in,out); break; - - case 22: 
ipackwithoutmask22(initOffset,in,out); break; - - case 23: ipackwithoutmask23(initOffset,in,out); break; - - case 24: ipackwithoutmask24(initOffset,in,out); break; - - case 25: ipackwithoutmask25(initOffset,in,out); break; - - case 26: ipackwithoutmask26(initOffset,in,out); break; - - case 27: ipackwithoutmask27(initOffset,in,out); break; - - case 28: ipackwithoutmask28(initOffset,in,out); break; - - case 29: ipackwithoutmask29(initOffset,in,out); break; - - case 30: ipackwithoutmask30(initOffset,in,out); break; - - case 31: ipackwithoutmask31(initOffset,in,out); break; - - case 32: ipackwithoutmask32(initOffset,in,out); break; - - default: break; - } +void simdfastsetd1fromprevious(__m128i *in, uint32_t bit, + uint32_t previousvalue, uint32_t value, + size_t index) { + simdfastset(in, bit, value - previousvalue, index); } +#ifdef __SSE4_1__ - - -void simdpackd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { - __m128i initOffset = _mm_set1_epi32 (initvalue); - switch(bit) { - case 0: break;; - - case 1: ipack1(initOffset, in,out); break; - - case 2: ipack2(initOffset, in,out); break; - - case 3: ipack3(initOffset, in,out); break; - - case 4: ipack4(initOffset, in,out); break; - - case 5: ipack5(initOffset, in,out); break; - - case 6: ipack6(initOffset, in,out); break; - - case 7: ipack7(initOffset, in,out); break; - - case 8: ipack8(initOffset, in,out); break; - - case 9: ipack9(initOffset, in,out); break; - - case 10: ipack10(initOffset, in,out); break; - - case 11: ipack11(initOffset, in,out); break; - - case 12: ipack12(initOffset, in,out); break; - - case 13: ipack13(initOffset, in,out); break; - - case 14: ipack14(initOffset, in,out); break; - - case 15: ipack15(initOffset, in,out); break; - - case 16: ipack16(initOffset, in,out); break; - - case 17: ipack17(initOffset, in,out); break; - - case 18: ipack18(initOffset, in,out); break; - - case 19: ipack19(initOffset, in,out); break; - - case 20: ipack20(initOffset, in,out); break; - - case 21: ipack21(initOffset, in,out); break; - - case 22: ipack22(initOffset, in,out); break; - - case 23: ipack23(initOffset, in,out); break; - - case 24: ipack24(initOffset, in,out); break; - - case 25: ipack25(initOffset, in,out); break; - - case 26: ipack26(initOffset, in,out); break; - - case 27: ipack27(initOffset, in,out); break; - - case 28: ipack28(initOffset, in,out); break; - - case 29: ipack29(initOffset, in,out); break; - - case 30: ipack30(initOffset, in,out); break; - - case 31: ipack31(initOffset, in,out); break; - - case 32: ipack32(initOffset, in,out); break; - - default: break; - } +void simdfastsetd1(uint32_t initvalue, __m128i *in, uint32_t bit, + uint32_t value, size_t index) { + if (index == 0) { + simdfastset(in, bit, value - initvalue, index); + } else { + uint32_t prev = simdselectd1(initvalue, in, bit, index - 1); + simdfastset(in, bit, value - prev, index); + } } +#endif diff --git a/src/simdpackedsearch.c b/src/simdpackedsearch.c new file mode 100644 index 0000000..c4aaf53 --- /dev/null +++ b/src/simdpackedsearch.c @@ -0,0 +1,16691 @@ +/** + * This code is released under a BSD License. 
+ */ +#ifdef __SSE4_1__ + +#include "simdintegratedbitpacking.h" +#include + +SIMDCOMP_ALIGNED(16) +static int8_t shuffle_mask_bytes[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, + 15, 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, + 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, + 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, + 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; +static const __m128i *shuffle_mask = (__m128i *)shuffle_mask_bytes; + +/* should emulate std::lower_bound */ +static int lower_bound(uint32_t *A, uint32_t key, int imin, int imax) { + int imid; + imax--; + while (imin + 1 < imax) { + imid = imin + ((imax - imin) / 2); + + if (A[imid] >= key) { + imax = imid; + } else if (A[imid] < key) { + imin = imid; + } + } + if (A[imin] >= key) + return imin; + return imax; +} + +/* 4-lane inclusive prefix sum of |curr|, plus the last lane of |prev| broadcast to every lane */ +#define PrefixSum(ret, curr, prev) \ + do { \ + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); \ + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); \ + ret = _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); \ + } while (0) + +/* perform a lower-bound search for |key| in |out|; the resulting uint32 + * is stored in |*presult|. |out| and the key are both biased by 2^31 + * (|conversion|, |key4|) so the signed _mm_cmplt_epi32 acts as an unsigned compare. */ +#define CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult) \ + do { \ + __m128i tmpout = _mm_sub_epi32(out, conversion); \ + uint32_t mmask = \ + _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmpout, key4))); \ + if (mmask != 15) { \ + const __m128i p = _mm_shuffle_epi8(out, shuffle_mask[mmask ^ 15]); \ + int offset; \ + int remaining = length - i; \ + SIMDCOMP_CTZ(offset, mmask ^ 15); \ + *presult = _mm_cvtsi128_si32(p); \ + if (offset < remaining) \ + return (i + offset); \ + } \ + i += 4; \ + if (i >= length) { /* reached end of array? 
*/ \ + *presult = key + 1; \ + return (length); \ + } \ + } while (0) + +static int iunpacksearchwithlength0(__m128i initOffset, const __m128i *_in, + int length, uint32_t key, + uint32_t *presult) { + if (length > 0) { + uint32_t repeatedvalue = (uint32_t)_mm_extract_epi32(initOffset, 3); + if (repeatedvalue >= key) { + *presult = repeatedvalue; + return 0; + } + } + (void)_in; + *presult = key + 1; + return (length); +} + +static int iunpacksearchwithlength1(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 1) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = 
out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength2(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + 
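+  /* decode 128 2-bit deltas; the search prefix-sums and tests 4 candidate values at a time against |key| */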
__m128i mask = _mm_set1_epi32((1U << 2) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, 
presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength3(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 3) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
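+  /* |initOffset| carries the running prefix sum: its last lane seeds the next group of 4 values */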
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength4(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 4) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, 
length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + 
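+  /* a bit width of 4 divides 32, so no value straddles an input word: each new 128-bit word is simply loaded and decoded */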
PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength5(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 5) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + 
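+  /* this 5-bit value straddles the 32-bit word boundary: its low 3 bits sit at the top of the current word and its high 2 bits come from the next word */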
++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength6(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 6) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, 
presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength7(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 7) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength8(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 8) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, 
length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength9(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 9) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
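/* PrefixSum integrates each group of four extracted deltas into running values (presumably an inclusive prefix sum seeded from the last lane of initOffset), and initOffset = out then carries the running total into the next group */ +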
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength10(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 10) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
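/* conversion and key4, prepared at the top of every function, are presumably consumed inside CHECK_AND_INCREMENT_WITH_LENGTH so that the unsigned lanes can be compared against key using SSE2's signed compares */ +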
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = 
tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength11(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 11) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, 
out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + + 
PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength12(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 12) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength13(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 13) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 
8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
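/* CHECK_AND_INCREMENT_WITH_LENGTH presumably returns as soon as a qualifying value is found among the first length integers; if the whole block is scanned without a hit, the function falls through to store key + 1 in *presult and return 128 */ +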
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength14(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 14) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
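/* each iunpacksearchwithlengthN routine walks one full block of 128 integers as 32 groups of four 32-bit lanes, unpacking, prefix-summing and checking every group in turn */ +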
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); 
+ out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength15(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 15) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength16(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 16) - 1); + __m128i conversion = 
_mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength17(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 17) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset 
= out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + 
return (128); +} + +static int iunpacksearchwithlength18(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 18) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength19(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 19) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength20(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 20) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength21(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 21) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength22(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 22) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, 
length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg 
= _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength23(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 23) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, 
length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = 
out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength24(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 24) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength25(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 25) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + + PrefixSum(out, out, 
initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength26(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 26) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + 
tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, 
length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 
24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength27(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 27) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 
24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength28(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 28) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 
2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength29(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 29) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, 
presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, 
presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength30(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 30) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + 
++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 
16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength31(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 31) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; 
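+ /* the four deltas decoded into |out| have just been prefix-summed into + * absolute values starting from |initOffset|; CHECK_AND_INCREMENT_WITH_LENGTH + * below scans them against |key| while respecting the logical |length| of + * the block. */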
+ CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, 
presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + 
InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearchwithlength32(__m128i initOffset, const __m128i *in, + int length, uint32_t key, + uint32_t *presult) { + uint32_t *in32 = (uint32_t *)in; + int answer = lower_bound(in32, key, 0, length); + if (in32[answer] < key) { + *presult = key + 1; + return (length); + } + (void)initOffset; + *presult = in32[answer]; + return answer; +} + +int simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit, + int length, uint32_t key, uint32_t *presult) { + __m128i initOffset = _mm_set1_epi32(initvalue); + switch (bit) { + case 0: + return iunpacksearchwithlength0(initOffset, in, length, key, presult); + + case 1: + return iunpacksearchwithlength1(initOffset, in, length, key, presult); + + case 2: + return iunpacksearchwithlength2(initOffset, in, length, key, presult); + + case 3: + return iunpacksearchwithlength3(initOffset, in, length, key, presult); + + case 4: + return iunpacksearchwithlength4(initOffset, in, length, key, presult); + + case 5: + return iunpacksearchwithlength5(initOffset, in, length, key, presult); + + case 6: + return iunpacksearchwithlength6(initOffset, in, length, key, presult); + + case 7: + return iunpacksearchwithlength7(initOffset, in, length, key, presult); + + case 8: + return iunpacksearchwithlength8(initOffset, in, length, key, presult); + + case 9: + return iunpacksearchwithlength9(initOffset, in, length, key, presult); + + case 10: + return iunpacksearchwithlength10(initOffset, in, length, key, presult); + + case 11: + return iunpacksearchwithlength11(initOffset, in, length, key, presult); + + case 12: + return iunpacksearchwithlength12(initOffset, in, length, key, presult); + + case 13: + return iunpacksearchwithlength13(initOffset, in, length, key, presult); + + case 14: + return iunpacksearchwithlength14(initOffset, in, length, key, presult); + + case 15: + return iunpacksearchwithlength15(initOffset, in, length, key, presult); + + case 16: + return iunpacksearchwithlength16(initOffset, in, length, key, presult); + + case 17: + return iunpacksearchwithlength17(initOffset, in, length, key, presult); + + case 18: + return iunpacksearchwithlength18(initOffset, in, length, key, presult); + + case 19: + return iunpacksearchwithlength19(initOffset, in, length, key, presult); + + case 20: + return iunpacksearchwithlength20(initOffset, in, length, key, presult); + + case 21: + return iunpacksearchwithlength21(initOffset, in, length, key, presult); + + case 22: + return iunpacksearchwithlength22(initOffset, in, length, key, presult); + 
+ case 23: + return iunpacksearchwithlength23(initOffset, in, length, key, presult); + + case 24: + return iunpacksearchwithlength24(initOffset, in, length, key, presult); + + case 25: + return iunpacksearchwithlength25(initOffset, in, length, key, presult); + + case 26: + return iunpacksearchwithlength26(initOffset, in, length, key, presult); + + case 27: + return iunpacksearchwithlength27(initOffset, in, length, key, presult); + + case 28: + return iunpacksearchwithlength28(initOffset, in, length, key, presult); + + case 29: + return iunpacksearchwithlength29(initOffset, in, length, key, presult); + + case 30: + return iunpacksearchwithlength30(initOffset, in, length, key, presult); + + case 31: + return iunpacksearchwithlength31(initOffset, in, length, key, presult); + + case 32: + return iunpacksearchwithlength32(initOffset, in, length, key, presult); + + default: + break; + } + return (-1); +} + +/* perform a lower-bound search for |key| in |out|; the resulting uint32 + * is stored in |*presult|.*/ +#define CHECK_AND_INCREMENT(i, out, key, presult) \ + do { \ + __m128i tmpout = _mm_sub_epi32(out, conversion); \ + uint32_t mmask = \ + _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmpout, key4))); \ + if (mmask != 15) { \ + __m128i p = _mm_shuffle_epi8(out, shuffle_mask[mmask ^ 15]); \ + int offset; \ + SIMDCOMP_CTZ(offset, mmask ^ 15); \ + *presult = _mm_cvtsi128_si32(p); \ + return (i + offset); \ + } \ + i += 4; \ + } while (0) + +static int iunpacksearch0(__m128i *initOffset, const __m128i *_in, uint32_t key, + uint32_t *presult) { + uint32_t repeatedvalue = (uint32_t)_mm_extract_epi32(*initOffset, 3); + if (repeatedvalue >= key) { + *presult = repeatedvalue; + return 0; + } + *presult = key + 1; + (void)_in; + return (128); +} + +static int iunpacksearch1(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 1) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg, 29); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch2(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 2) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch3(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 3) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset 
= out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch4(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 4) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
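+ /* CHECK_AND_INCREMENT (defined above) biases the four values in |out| by + * 2^31 so that the signed _mm_cmplt_epi32 acts as an unsigned comparison + * with |key|; it returns the index of the first lane that is >= |key| (and + * stores that value in |*presult|), otherwise it advances |i| by 4. */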
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch5(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 5) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); 
+ out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch6(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 6) - 1); + __m128i conversion = 
_mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch7(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 7) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); 
+ out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch8(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 8) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + 
tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch9(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 9) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch10(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 10) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch11(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 11) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + 
__m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch12(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 12) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + 
out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, 
presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch13(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 13) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out 
= _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch14(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 14) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch15(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 15) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset 
= out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch16(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 16) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + 
InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, 
out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch17(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 17) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + 
out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch18(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 18) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
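/* after every PrefixSum the decoded vector is written back into *initOffset, so the running totals (presumably carried forward by PrefixSum from the previous group of four lanes) thread through all 128 values and on into the next block */ + 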
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = 
out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch19(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 19) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = 
tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch20(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 20) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch21(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 21) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch22(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 22) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 
8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch23(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 23) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; 
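+ /* a 23-bit value beginning at bit 15 straddles the lane: its low 17 bits come from the current word and the remaining 6 bits are OR'd in, shifted left by 23 - 6, from the word loaded just below */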
+ ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch24(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 24) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch25(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 25) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch26(__m128i *initOffset, 
const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 26) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + 
out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch27(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 27) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + 
++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch28(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 28) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 
- 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + PrefixSum(out, out, *initOffset); + 
out, *initOffset); +
*initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch29(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 29) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + 
++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), 
mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch30(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 30) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = 
out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch31(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 31) - 1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = 
_mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 3); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg, 1); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int iunpacksearch32(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + uint32_t *in32 = (uint32_t *)in; + int answer = lower_bound(in32, key, 0, 128); + if (in32[answer] < key) { + *presult = key + 1; + return (128); + } + *presult = in32[answer]; + *initOffset = _mm_load_si128(in + 31); + return answer; +} + +int simdsearchd1(__m128i *initOffset, const __m128i *in, uint32_t bit, + uint32_t key, uint32_t *presult) { + switch (bit) { + case 0: + return iunpacksearch0(initOffset, in, key, presult); + + case 1: + return iunpacksearch1(initOffset, in, key, presult); + + case 2: + return iunpacksearch2(initOffset, in, key, presult); + + case 3: + return iunpacksearch3(initOffset, in, key, presult); + + case 4: + return iunpacksearch4(initOffset, in, key, presult); + + case 5: + return iunpacksearch5(initOffset, in, key, presult); + + case 6: + return iunpacksearch6(initOffset, in, key, presult); + + case 7: + return iunpacksearch7(initOffset, in, key, presult); + + case 8: + return iunpacksearch8(initOffset, in, key, presult); + + case 9: + return iunpacksearch9(initOffset, in, key, presult); + + case 10: + return iunpacksearch10(initOffset, in, key, presult); + + case 11: + return iunpacksearch11(initOffset, in, key, presult); + + case 12: + return iunpacksearch12(initOffset, in, key, presult); + + case 13: + return iunpacksearch13(initOffset, in, key, presult); + + case 14: + return iunpacksearch14(initOffset, in, key, presult); + + case 15: + return iunpacksearch15(initOffset, in, key, presult); + + case 16: + return iunpacksearch16(initOffset, in, key, presult); + + case 17: + return iunpacksearch17(initOffset, in, key, presult); + + case 18: + return iunpacksearch18(initOffset, in, key, presult); + + case 19: + return iunpacksearch19(initOffset, in, key, presult); + + case 20: + return iunpacksearch20(initOffset, in, key, presult); + + case 21: + return iunpacksearch21(initOffset, in, key, presult); + + case 22: + return iunpacksearch22(initOffset, in, key, presult); + + case 23: + return iunpacksearch23(initOffset, in, key, 
presult); + + case 24: + return iunpacksearch24(initOffset, in, key, presult); + + case 25: + return iunpacksearch25(initOffset, in, key, presult); + + case 26: + return iunpacksearch26(initOffset, in, key, presult); + + case 27: + return iunpacksearch27(initOffset, in, key, presult); + + case 28: + return iunpacksearch28(initOffset, in, key, presult); + + case 29: + return iunpacksearch29(initOffset, in, key, presult); + + case 30: + return iunpacksearch30(initOffset, in, key, presult); + + case 31: + return iunpacksearch31(initOffset, in, key, presult); + + case 32: + return iunpacksearch32(initOffset, in, key, presult); + + default: + break; + } + return (-1); +} + +#endif diff --git a/src/simdpackedselect.c b/src/simdpackedselect.c new file mode 100644 index 0000000..e81f66a --- /dev/null +++ b/src/simdpackedselect.c @@ -0,0 +1,15357 @@ +/** + * This code is released under a BSD License. + */ +#ifdef __SSE4_1__ +#include "simdintegratedbitpacking.h" +#include <smmintrin.h> + +SIMDCOMP_ALIGNED(16) +int8_t shuffle_mask_bytes[256] = { + 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const __m128i *shuffle_mask = (__m128i *)shuffle_mask_bytes; + +uint32_t branchlessextract(__m128i out, int i) { + return _mm_cvtsi128_si32(_mm_shuffle_epi8(out, shuffle_mask[i])); +} + +#define PrefixSum(ret, curr, prev) \ + do { \ + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); \ + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); \ + ret = _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); \ + } while (0) + +#define CHECK_AND_INCREMENT(i, out, slot) \ + i += 4; \ + if (i > slot) { \ + return branchlessextract(out, slot - (i - 4)); \ + } + +static uint32_t iunpackselect1(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 1) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,
9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = 
out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect2(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 2) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect3(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 3) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect4(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 4) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect5(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 5) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect6(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 6) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 
26); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect7(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 7) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect8(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 8) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, 
mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + 
tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect9(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 9) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg, 23); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect10(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 10) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + 
tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect11(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 11) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), 
mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 
7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect12(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 12) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect13(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 13) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect14(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 14) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect15(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 15) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect16(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 16) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect17(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 17) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect18(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 18) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 
- 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect19(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 19) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect20(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 20) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); 
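+  /* For bit widths that share a factor with 32 the packed layout repeats
+   * after a fixed number of values (every 8 values at 20 bits, every 4 at
+   * 24 bits), which is why the generated bodies below contain the same
+   * shift/mask block several times over. */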
+ *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, 
mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect21(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 21) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, 
out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect22(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 22) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect23(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 23) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t 
iunpackselect24(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 24) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect25(__m128i *initOffset, const __m128i *in, + 
int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 25) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + 
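+  /* Each step in these iunpackselectN helpers follows one generated pattern:
+   * extract the next N-bit field with a shift and mask (merging in the low
+   * bits of the following 32-bit word when the field straddles a boundary),
+   * run PrefixSum() so the stored delta becomes a running value relative to
+   * *initOffset, and let CHECK_AND_INCREMENT() (defined earlier in this
+   * file) return the lane once the running index i covers the requested
+   * slot. The helpers are static and presumably dispatched by bit width
+   * from a wrapper elsewhere in this translation unit. */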
+ tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect26(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 26) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + 
++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect27(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 27) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect28(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 28) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; 
+ CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect29(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 29) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); 
+ + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), 
mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect30(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 30) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + 
InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect31(__m128i *initOffset, const __m128i *in, + int slot) { + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 31) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 3); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg, 1); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + return (0); +} + +static uint32_t iunpackselect32(__m128i *initOffset, const __m128i *in, + int slot) { + uint32_t *begin = (uint32_t *)in; + *initOffset = _mm_load_si128(in + 31); + return begin[slot]; +} + +uint32_t simdselectd1(uint32_t init, const 
__m128i *in, uint32_t bit, + int slot) { + __m128i vecinitOffset = _mm_set1_epi32(init); + __m128i *initOffset = &vecinitOffset; + slot &= 127; /* to avoid problems */ + + switch (bit) { + case 0: + return _mm_extract_epi32(*initOffset, 3); + break; + + case 1: + return iunpackselect1(initOffset, in, slot); + break; + + case 2: + return iunpackselect2(initOffset, in, slot); + break; + + case 3: + return iunpackselect3(initOffset, in, slot); + break; + + case 4: + return iunpackselect4(initOffset, in, slot); + break; + + case 5: + return iunpackselect5(initOffset, in, slot); + break; + + case 6: + return iunpackselect6(initOffset, in, slot); + break; + + case 7: + return iunpackselect7(initOffset, in, slot); + break; + + case 8: + return iunpackselect8(initOffset, in, slot); + break; + + case 9: + return iunpackselect9(initOffset, in, slot); + break; + + case 10: + return iunpackselect10(initOffset, in, slot); + break; + + case 11: + return iunpackselect11(initOffset, in, slot); + break; + + case 12: + return iunpackselect12(initOffset, in, slot); + break; + + case 13: + return iunpackselect13(initOffset, in, slot); + break; + + case 14: + return iunpackselect14(initOffset, in, slot); + break; + + case 15: + return iunpackselect15(initOffset, in, slot); + break; + + case 16: + return iunpackselect16(initOffset, in, slot); + break; + + case 17: + return iunpackselect17(initOffset, in, slot); + break; + + case 18: + return iunpackselect18(initOffset, in, slot); + break; + + case 19: + return iunpackselect19(initOffset, in, slot); + break; + + case 20: + return iunpackselect20(initOffset, in, slot); + break; + + case 21: + return iunpackselect21(initOffset, in, slot); + break; + + case 22: + return iunpackselect22(initOffset, in, slot); + break; + + case 23: + return iunpackselect23(initOffset, in, slot); + break; + + case 24: + return iunpackselect24(initOffset, in, slot); + break; + + case 25: + return iunpackselect25(initOffset, in, slot); + break; + + case 26: + return iunpackselect26(initOffset, in, slot); + break; + + case 27: + return iunpackselect27(initOffset, in, slot); + break; + + case 28: + return iunpackselect28(initOffset, in, slot); + break; + + case 29: + return iunpackselect29(initOffset, in, slot); + break; + + case 30: + return iunpackselect30(initOffset, in, slot); + break; + + case 31: + return iunpackselect31(initOffset, in, slot); + break; + + case 32: + return iunpackselect32(initOffset, in, slot); + break; + + default: + break; + } + + return (-1); +} + +static void iunpackscan1(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 1) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan2(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 2) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
_mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan3(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 3) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
_mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan4(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 4) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan5(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 5) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan6(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 6) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + 
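+ /* the usual per-field step: the 6-bit field isolated above is handed to PrefixSum below, which folds it into the running prefix sum carried forward through *initOffset */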
PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan7(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 7) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
_mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan8(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 8) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan9(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 9) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
_mm_srli_epi32(InReg, 22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); 
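+ /* the OR above completes a 9-bit value that straddled the 32-bit word boundary: its low 4 bits came from the previous word, and the remaining 5 bits arrive in the new word, shifted left by 4 (i.e. 9 - 5) before being masked in */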
+ + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan10(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 10) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan11(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 11) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
_mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + 
PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan12(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 12) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan13(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 13) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan14(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i 
tmp; + __m128i mask = _mm_set1_epi32((1U << 14) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan15(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 15) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan16(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 16) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, 
mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan17(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 17) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan18(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 18) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + 
out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = 
tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan19(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 19) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
_mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + 
+ PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan20(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 20) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
_mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan21(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 21) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
21 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan22(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 22) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 
4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan23(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 23) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = 
out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan24(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 24) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 
8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan25(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 25) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
_mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan26(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 26) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
_mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan27(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 27) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = 
_mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = 
out; + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan28(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 28) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; 
+ InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan29(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 29) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan30(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 30) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + 
tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan31(__m128i *initOffset, const __m128i *in) { + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U << 31) - 1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 31); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 30); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 29); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 28); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 27); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 26); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 25); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 24); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 23); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 22); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 21); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 20); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 19); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 18); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 17); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 16); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 15); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 14); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 13); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 12); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 11); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 10); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 9); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 8); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 7); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 6); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 5); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 4); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 3); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 2); + out = tmp; + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + tmp = _mm_srli_epi32(InReg, 1); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; +} + +static void iunpackscan32(__m128i *initOffset, const __m128i *in) { + *initOffset = _mm_load_si128(in + 31); +} + +void simdscand1(__m128i *initOffset, const __m128i *in, uint32_t bit) { + switch (bit) { + case 0: + return; + break; + + case 1: + iunpackscan1(initOffset, in); + break; + + case 2: + iunpackscan2(initOffset, in); + break; + + case 3: + iunpackscan3(initOffset, in); + break; + + case 4: + iunpackscan4(initOffset, in); + break; + + case 5: + iunpackscan5(initOffset, in); + break; + + case 6: + iunpackscan6(initOffset, in); + break; + + case 7: + iunpackscan7(initOffset, in); + break; + + case 8: + iunpackscan8(initOffset, in); + break; + + case 9: + iunpackscan9(initOffset, in); + break; + + case 10: + iunpackscan10(initOffset, in); + break; + + case 11: + iunpackscan11(initOffset, in); + break; + + case 12: + iunpackscan12(initOffset, in); + break; + + case 13: + iunpackscan13(initOffset, in); + break; + + case 14: + iunpackscan14(initOffset, in); + break; + + case 15: + iunpackscan15(initOffset, in); + break; + + case 16: + iunpackscan16(initOffset, in); + break; + + case 17: + iunpackscan17(initOffset, in); + break; + + case 18: + iunpackscan18(initOffset, in); + break; + + case 19: + iunpackscan19(initOffset, in); + break; + + case 20: + iunpackscan20(initOffset, in); + break; + + case 21: + iunpackscan21(initOffset, in); + break; + + case 22: + iunpackscan22(initOffset, in); + break; + + case 23: + iunpackscan23(initOffset, in); + break; + + case 24: + iunpackscan24(initOffset, in); + break; + + case 25: + iunpackscan25(initOffset, in); + break; + + case 26: + iunpackscan26(initOffset, in); + break; + + case 27: + iunpackscan27(initOffset, in); + break; + + case 28: + iunpackscan28(initOffset, in); + break; + + case 29: + iunpackscan29(initOffset, in); + break; + + case 30: + iunpackscan30(initOffset, in); + break; + + case 31: + iunpackscan31(initOffset, in); + break; + + case 32: + iunpackscan32(initOffset, in); + break; + + default: + break; + } + + return; +} + +#endif diff --git a/src/unit.c b/src/unit.c deleted file mode 100644 index 63339ae..0000000 --- a/src/unit.c +++ /dev/null @@ -1,62 +0,0 @@ -/** - * This code is 
released under a BSD License. - */ -#include -#include "simdcomp.h" - - -int main() { - int N = 5000 * SIMDBlockSize; - __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); - uint32_t * datain = malloc(N * sizeof(uint32_t)); - uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); - for (int gap = 1; gap <= 387420489; gap *= 3) { - printf(" gap = %u \n", gap); - for (int k = 0; k < N; ++k) - datain[k] = k * gap; - uint32_t offset = 0; - for (int k = 0; k * SIMDBlockSize < N; ++k) { - ///////////////////////////// - // First part works for general arrays (sorted or unsorted) - ///////////////////////////// - // we compute the bit width - const uint32_t b = maxbits(datain + k * SIMDBlockSize); - // we read 128 integers at "datain + k * SIMDBlockSize" and - // write b 128-bit vectors at "buffer" - simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); - // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer - simdunpack(buffer, backbuffer, b);//uncompressed - for (int j = 0; j < SIMDBlockSize; ++j) { - if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { - printf("bug in simdpack\n"); - return -2; - } - } - ///////////////////////////// - // next part assumes that the data is sorted (uses differential coding) - ///////////////////////////// - // we compute the bit width - const uint32_t b1 = simdmaxbitsd1(offset, - datain + k * SIMDBlockSize); - // we read 128 integers at "datain + k * SIMDBlockSize" and - // write b1 128-bit vectors at "buffer" - simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, - b1); - // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer - simdunpackd1(offset, buffer, backbuffer, b1); - for (int j = 0; j < SIMDBlockSize; ++j) { - if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { - printf("bug in simdpack d1\n"); - return -3; - } - } - offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; - - } - } - free(buffer); - free(datain); - free(backbuffer); - printf("Code looks good.\n"); - return 0; -} diff --git a/tests/unit.c b/tests/unit.c new file mode 100644 index 0000000..f24a263 --- /dev/null +++ b/tests/unit.c @@ -0,0 +1,1109 @@ +/** + * This code is released under a BSD License. 
+ */ +#include "simdcomp.h" +#include +#include +#include + +int issue21() { + uint32_t bw, sz; + printf("issue21"); + fflush(stdout); + for (bw = 0; bw < 30; bw++) { + printf("."); + fflush(stdout); + for (sz = 1; sz < 4096; sz++) { + + size_t i; + uint32_t *in = malloc(sz * sizeof(uint32_t)); + uint32_t *out = malloc(sz * sizeof(uint32_t)); + for (i = 0; i < sz; ++i) + in[i] = (1 << bw) - 1; + uint32_t b = maxbits_length(in, sz); + uint8_t *buf = malloc(simdpack_compressedbytes(sz, b)); + __m128i *end = simdpack_length(in, sz, (__m128i *)buf, b); + if ((uint8_t *)end - buf != simdpack_compressedbytes(sz, b)) { + printf("bad mem usage\n"); + return -1; + } + simdunpack_length((const __m128i *)buf, sz, out, b); + for (i = 0; i < sz; ++i) { + if (in[i] != out[i]) { + printf("bug\n"); + return -1; + } + } + free(in); + free(out); + free(buf); + } + } + printf("\n"); + return 0; +} + +int issue21FOR() { + uint32_t bw, sz; + size_t i, j; + printf("issue21for"); + fflush(stdout); + for (bw = 0; bw < 30; bw++) { + printf("."); + fflush(stdout); + for (sz = 1; sz < 4096; sz++) { + + uint32_t *in = malloc(sz * sizeof(uint32_t)); + uint32_t *out = malloc(sz * sizeof(uint32_t)); + in[0] = 0; + for (i = 1; i < sz; ++i) + in[i] = (1 << bw) - 1; + uint32_t b = maxbits_length(in, sz); + uint8_t *buf = malloc(simdpackFOR_compressedbytes(sz, b)); + __m128i *end = simdpackFOR_length(0, in, sz, (__m128i *)buf, b); + if ((uint8_t *)end - buf != simdpackFOR_compressedbytes(sz, b)) { + printf("bad mem usage\n"); + return -1; + } + simdunpackFOR_length(0, (const __m128i *)buf, sz, out, b); + for (i = 0; i < sz; ++i) { + if (in[i] != out[i]) { + for (j = 0; j < sz; ++j) { + printf("%zu : %u %u \n", j, in[j], out[j]); + } + printf("bug\n"); + return -1; + } + } + free(in); + free(out); + free(buf); + } + } + printf("\n"); + return 0; +} + +int testshortpack() { + int bit; + size_t i; + size_t length; + __m128i *bb; + srand(0); + printf("[%s]\n", __func__); + for (bit = 0; bit < 32; ++bit) { + printf(" %d ", bit); + fflush(stdout); + const size_t N = 128; + uint32_t *data = malloc(N * sizeof(uint32_t)); + uint32_t *backdata = malloc(N * sizeof(uint32_t)); + uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + for (i = 0; i < N; ++i) { + data[i] = rand() & ((1U << bit) - 1); + } + for (length = 0; length <= N; ++length) { + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + bb = simdpack_shortlength(data, length, (__m128i *)buffer, bit); + if ((bb - (__m128i *)buffer) * sizeof(__m128i) != + (unsigned)simdpack_compressedbytes(length, bit)) { + printf("bug\n"); + return -1; + } + simdunpack_shortlength((__m128i *)buffer, length, backdata, bit); + for (i = 0; i < length; ++i) { + + if (data[i] != backdata[i]) { + printf("bug\n"); + return -1; + } + } + } + free(data); + free(backdata); + free(buffer); + } + return 0; +} + +int testlongpack() { + int bit; + size_t i; + size_t length; + __m128i *bb; + srand(0); + printf("[%s]\n", __func__); + for (bit = 0; bit < 32; ++bit) { + const size_t N = 2048; + uint32_t *data = malloc(N * sizeof(uint32_t)); + uint32_t *backdata = malloc(N * sizeof(uint32_t)); + uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + for (i = 0; i < N; ++i) { + data[i] = rand() & ((1U << bit) - 1); + } + for (length = 0; length <= N; ++length) { + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + bb = simdpack_length(data, length, (__m128i *)buffer, bit); + if ((bb - (__m128i *)buffer) * sizeof(__m128i) != + (unsigned)simdpack_compressedbytes(length, bit)) { + 
printf("bug\n"); + return -1; + } + simdunpack_length((__m128i *)buffer, length, backdata, bit); + for (i = 0; i < length; ++i) { + + if (data[i] != backdata[i]) { + printf("bug\n"); + return -1; + } + } + } + free(data); + free(backdata); + free(buffer); + } + return 0; +} + +int testset() { + int bit; + size_t i; + const size_t N = 128; + uint32_t *data = malloc(N * sizeof(uint32_t)); + uint32_t *backdata = malloc(N * sizeof(uint32_t)); + uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + srand(0); + printf("[%s]\n", __func__); + for (bit = 0; bit < 32; ++bit) { + printf("simple set %d \n", bit); + + for (i = 0; i < N; ++i) { + data[i] = rand() & ((1U << bit) - 1); + } + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + simdpack(data, (__m128i *)buffer, bit); + simdunpack((__m128i *)buffer, backdata, bit); + for (i = 0; i < N; ++i) { + if (data[i] != backdata[i]) { + printf("bug\n"); + return -1; + } + } + + for (i = N; i > 0; i--) { + simdfastset((__m128i *)buffer, bit, data[N - i], i - 1); + } + simdunpack((__m128i *)buffer, backdata, bit); + for (i = 0; i < N; ++i) { + if (data[i] != backdata[N - i - 1]) { + printf("bug\n"); + return -1; + } + } + simdpack(data, (__m128i *)buffer, bit); + for (i = 1; i <= N; i++) { + simdfastset((__m128i *)buffer, bit, data[i - 1], i - 1); + } + simdunpack((__m128i *)buffer, backdata, bit); + for (i = 0; i < N; ++i) { + if (data[i] != backdata[i]) { + printf("bug\n"); + return -1; + } + } + } + free(data); + free(backdata); + free(buffer); + + return 0; +} + +#ifdef __SSE4_1__ + +int testsetd1() { + int bit; + size_t i; + uint32_t newvalue; + const size_t N = 128; + uint32_t *data = malloc(N * sizeof(uint32_t)); + uint32_t *datazeroes = malloc(N * sizeof(uint32_t)); + + uint32_t *backdata = malloc(N * sizeof(uint32_t)); + uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + srand(0); + printf("[%s]\n", __func__); + for (bit = 0; bit < 32; ++bit) { + printf("simple set d1 %d \n", bit); + data[0] = rand() & ((1U << bit) - 1); + datazeroes[0] = 0; + + for (i = 1; i < N; ++i) { + data[i] = data[i - 1] + (rand() & ((1U << bit) - 1)); + datazeroes[i] = 0; + } + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + simdpackd1(0, datazeroes, (__m128i *)buffer, bit); + for (i = 1; i <= N; i++) { + simdfastsetd1(0, (__m128i *)buffer, bit, data[i - 1], i - 1); + newvalue = simdselectd1(0, (const __m128i *)buffer, bit, i - 1); + if (newvalue != data[i - 1]) { + printf("bad set-select\n"); + return -1; + } + } + simdunpackd1(0, (__m128i *)buffer, backdata, bit); + for (i = 0; i < N; ++i) { + if (data[i] != backdata[i]) + return -1; + } + } + free(data); + free(backdata); + free(buffer); + free(datazeroes); + return 0; +} +#endif + +int testsetFOR() { + int bit; + size_t i; + uint32_t newvalue; + const size_t N = 128; + uint32_t *data = malloc(N * sizeof(uint32_t)); + uint32_t *datazeroes = malloc(N * sizeof(uint32_t)); + + uint32_t *backdata = malloc(N * sizeof(uint32_t)); + uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + srand(0); + printf("[%s]\n", __func__); + for (bit = 0; bit < 32; ++bit) { + printf("simple set FOR %d \n", bit); + for (i = 0; i < N; ++i) { + data[i] = (rand() & ((1U << bit) - 1)); + datazeroes[i] = 0; + } + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + simdpackFOR(0, datazeroes, (__m128i *)buffer, bit); + for (i = 1; i <= N; i++) { + simdfastsetFOR(0, (__m128i *)buffer, bit, data[i - 1], i - 1); + newvalue = simdselectFOR(0, (const __m128i *)buffer, bit, i - 1); + if (newvalue != data[i - 1]) { 
+ printf("bad set-select\n"); + return -1; + } + } + simdunpackFOR(0, (__m128i *)buffer, backdata, bit); + for (i = 0; i < N; ++i) { + if (data[i] != backdata[i]) + return -1; + } + } + free(data); + free(backdata); + free(buffer); + free(datazeroes); + return 0; +} + +int testshortFORpack() { + int bit; + size_t i; + __m128i *rb; + size_t length; + uint32_t offset = 7; + srand(0); + printf("[%s]\n", __func__); + for (bit = 0; bit < 32; ++bit) { + printf(" %d ", bit); + fflush(stdout); + const size_t N = 128; + uint32_t *data = malloc(N * sizeof(uint32_t)); + uint32_t *backdata = malloc(N * sizeof(uint32_t)); + uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + for (i = 0; i < N; ++i) { + data[i] = (rand() & ((1U << bit) - 1)) + offset; + } + for (length = 0; length <= N; ++length) { + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + rb = simdpackFOR_length(offset, data, length, (__m128i *)buffer, bit); + if (((rb - (__m128i *)buffer) * sizeof(__m128i)) != + (unsigned)simdpackFOR_compressedbytes(length, bit)) { + return -1; + } + simdunpackFOR_length(offset, (__m128i *)buffer, length, backdata, bit); + for (i = 0; i < length; ++i) { + + if (data[i] != backdata[i]) + return -1; + } + } + free(data); + free(backdata); + free(buffer); + } + return 0; +} + +#ifdef __AVX2__ + +int testbabyavx() { + int bit; + int trial; + unsigned int i, j; + const size_t N = AVXBlockSize; + srand(0); + printf("[%s]\n", __func__); + printf("bit = "); + for (bit = 0; bit < 32; ++bit) { + printf(" %d ", bit); + fflush(stdout); + for (trial = 0; trial < 100; ++trial) { + uint32_t *data = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t)); + uint32_t *backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t)); + __m256i *buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32); + + for (i = 0; i < N; ++i) { + data[i] = rand() & ((uint32_t)(1U << bit) - 1); + } + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + if (avxmaxbits(data) != maxbits_length(data, N)) { + printf("avxmaxbits is buggy\n"); + return -1; + } + + avxpackwithoutmask(data, buffer, bit); + avxunpack(buffer, backdata, bit); + for (i = 0; i < AVXBlockSize; ++i) { + if (data[i] != backdata[i]) { + printf("bug\n"); + for (j = 0; j < N; ++j) { + if (data[j] != backdata[j]) { + printf("data[%d]=%d v.s. 
backdata[%d]=%d\n", j, data[j], j, + backdata[j]); + } else { + printf("data[%d]=%d\n", j, data[j]); + } + } + return -1; + } + } + free(data); + free(backdata); + free(buffer); + } + } + printf("\n"); + return 0; +} + +int testavx2() { + int N = 5000 * AVXBlockSize, gap; + __m256i *buffer = malloc(AVXBlockSize * sizeof(uint32_t)); + uint32_t *datain = malloc(N * sizeof(uint32_t)); + uint32_t *backbuffer = malloc(AVXBlockSize * sizeof(uint32_t)); + printf("[%s]\n", __func__); + for (gap = 1; gap <= 387420489; gap *= 3) { + int k; + printf(" gap = %u \n", gap); + for (k = 0; k < N; ++k) + datain[k] = (uint32_t)(((uint64_t)k * gap) & 0xFFFFFFFF); + for (k = 0; k * AVXBlockSize < N; ++k) { + /* + First part works for general arrays (sorted or unsorted) + */ + int j; + /* we compute the bit width */ + const uint32_t b = avxmaxbits(datain + k * AVXBlockSize); + if (avxmaxbits(datain + k * AVXBlockSize) != + maxbits_length(datain + k * AVXBlockSize, AVXBlockSize)) { + printf("avxmaxbits is buggy %d %d \n", + avxmaxbits(datain + k * AVXBlockSize), + maxbits_length(datain + k * AVXBlockSize, AVXBlockSize)); + return -1; + } + + /* we read 256 integers at "datain + k * AVXBlockSize" and + write b 256-bit vectors at "buffer" */ + avxpackwithoutmask(datain + k * AVXBlockSize, buffer, b); + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at + * backbuffer */ + avxunpack(buffer, backbuffer, b); /* uncompressed */ + for (j = 0; j < AVXBlockSize; ++j) { + if (backbuffer[j] != datain[k * AVXBlockSize + j]) { + int i; + printf("bug in avxpack\n"); + for (i = 0; i < AVXBlockSize; ++i) { + printf("data[%d]=%d got back %d %s\n", i, + datain[k * AVXBlockSize + i], backbuffer[i], + datain[k * AVXBlockSize + i] != backbuffer[i] ? "bug" : ""); + } + return -2; + } + } + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +} +#endif /* avx2 */ + +#ifdef __AVX512F__ + +int testbabyavx512() { + int bit; + int trial; + unsigned int i, j; + const size_t N = AVX512BlockSize; + srand(0); + printf("[%s]\n", __func__); + printf("bit = "); + for (bit = 0; bit < 32; ++bit) { + printf(" %d ", bit); + fflush(stdout); + for (trial = 0; trial < 100; ++trial) { + uint32_t *data = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t)); + uint32_t *backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t)); + __m512i *buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32); + + for (i = 0; i < N; ++i) { + data[i] = rand() & ((uint32_t)(1U << bit) - 1); + } + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + if (avx512maxbits(data) != maxbits_length(data, N)) { + printf("avx512maxbits is buggy\n"); + return -1; + } + + avx512packwithoutmask(data, buffer, bit); + avx512unpack(buffer, backdata, bit); + for (i = 0; i < AVX512BlockSize; ++i) { + if (data[i] != backdata[i]) { + printf("bug\n"); + for (j = 0; j < N; ++j) { + if (data[j] != backdata[j]) { + printf("data[%d]=%d v.s. 
backdata[%d]=%d\n", j, data[j], j, + backdata[j]); + } else { + printf("data[%d]=%d\n", j, data[j]); + } + } + return -1; + } + } + free(data); + free(backdata); + free(buffer); + } + } + printf("\n"); + return 0; +} + +int testavx512_2() { + int N = 5000 * AVX512BlockSize, gap; + __m512i *buffer = malloc(AVX512BlockSize * sizeof(uint32_t)); + uint32_t *datain = malloc(N * sizeof(uint32_t)); + uint32_t *backbuffer = malloc(AVX512BlockSize * sizeof(uint32_t)); + printf("[%s]\n", __func__); + for (gap = 1; gap <= 387420489; gap *= 3) { + int k; + printf(" gap = %u \n", gap); + for (k = 0; k < N; ++k) { + datain[k] = k * gap; + } + for (k = 0; k * AVX512BlockSize < N; ++k) { + /* + * First part works for general arrays (sorted or unsorted) + * */ + int j; + /* we compute the bit width */ + const uint32_t b = avx512maxbits(datain + k * AVX512BlockSize); + if (b != maxbits_length(datain + k * AVX512BlockSize, AVX512BlockSize)) { + printf("avx512maxbits is buggy %d %d \n", + avx512maxbits(datain + k * AVX512BlockSize), + maxbits_length(datain + k * AVX512BlockSize, AVX512BlockSize)); + return -1; + } + + /* we read 512 integers at "datain + k * AVX512BlockSize" and + * write b 512-bit vectors at "buffer" */ + avx512packwithoutmask(datain + k * AVX512BlockSize, buffer, b); + /* we read back b1 512-bit vectors at "buffer" and write 512 integers at + * backbuffer */ + avx512unpack(buffer, backbuffer, b); /* uncompressed */ + for (j = 0; j < AVX512BlockSize; ++j) { + if (backbuffer[j] != datain[k * AVX512BlockSize + j]) { + int i; + printf("bug in avx512pack\n"); + for (i = 0; i < AVX512BlockSize; ++i) { + printf("data[%d]=%d got back %d %s\n", i, + datain[k * AVX512BlockSize + i], backbuffer[i], + datain[k * AVX512BlockSize + i] != backbuffer[i] ? "bug" + : ""); + } + return -2; + } + } + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +} +#endif /* avx512 */ + +int test() { + int N = 5000 * SIMDBlockSize, gap; + __m128i *buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t *datain = malloc(N * sizeof(uint32_t)); + uint32_t *backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + printf("[%s]\n", __func__); + for (gap = 1; gap <= 387420489; gap *= 3) { + int k; + printf(" gap = %u \n", gap); + for (k = 0; k < N; ++k) + datain[k] = (uint32_t)(((uint64_t)k * gap) & 0xFFFFFFFF); + for (k = 0; k * SIMDBlockSize < N; ++k) { + /* + First part works for general arrays (sorted or unsorted) + */ + int j; + /* we compute the bit width */ + const uint32_t b = maxbits(datain + k * SIMDBlockSize); + /* we read 128 integers at "datain + k * SIMDBlockSize" and + write b 128-bit vectors at "buffer" */ + simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at + * backbuffer */ + simdunpack(buffer, backbuffer, b); /* uncompressed */ + for (j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpack\n"); + return -2; + } + } + + { + /* + next part assumes that the data is sorted (uses differential coding) + */ + uint32_t offset = 0; + /* we compute the bit width */ + const uint32_t b1 = simdmaxbitsd1(offset, datain + k * SIMDBlockSize); + /* we read 128 integers at "datain + k * SIMDBlockSize" and + write b1 128-bit vectors at "buffer" */ + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, b1); + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at + * backbuffer */ + 
simdunpackd1(offset, buffer, backbuffer, b1); + for (j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpack d1\n"); + return -3; + } + } + offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; + } + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +} + +#ifdef __SSE4_1__ +int testFOR() { + int N = 5000 * SIMDBlockSize, gap; + __m128i *buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t *datain = malloc(N * sizeof(uint32_t)); + uint32_t *backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t tmax, tmin, tb; + printf("[%s]\n", __func__); + for (gap = 1; gap <= 387420489; gap *= 2) { + int k; + printf(" gap = %u \n", gap); + for (k = 0; k < N; ++k) + datain[k] = (uint32_t)(((uint64_t)k * gap) & 0xFFFFFFFF); + for (k = 0; k * SIMDBlockSize < N; ++k) { + int j; + simdmaxmin_length(datain + k * SIMDBlockSize, SIMDBlockSize, &tmin, + &tmax); + /* we compute the bit width */ + tb = bits(tmax - tmin); + + /* we read 128 integers at "datain + k * SIMDBlockSize" and + write b 128-bit vectors at "buffer" */ + simdpackFOR(tmin, datain + k * SIMDBlockSize, buffer, tb); + + for (j = 0; j < SIMDBlockSize; ++j) { + uint32_t selectedvalue = simdselectFOR(tmin, buffer, tb, j); + if (selectedvalue != datain[k * SIMDBlockSize + j]) { + printf("bug in simdselectFOR\n"); + return -3; + } + } + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at + * backbuffer */ + simdunpackFOR(tmin, buffer, backbuffer, tb); /* uncompressed */ + for (j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpackFOR\n"); + return -2; + } + } + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +} +#endif + +#define MAX 300 +int test_simdmaxbitsd1_length() { + uint32_t result, buffer[MAX + 1]; + int i, j; + + memset(&buffer[0], 0xff, sizeof(buffer)); + printf("[%s]\n", __func__); + /* this test creates buffers of different length; each buffer is + * initialized to result in the following deltas: + * length 1: 2 + * length 2: 1 2 + * length 3: 1 1 2 + * length 4: 1 1 1 2 + * length 5: 1 1 1 1 2 + * etc. Each sequence's "maxbits" is 2. 
*/ + for (i = 0; i < MAX; i++) { + for (j = 0; j < i; j++) + buffer[j] = j + 1; + buffer[i] = i + 2; + + result = simdmaxbitsd1_length(0, &buffer[0], i + 1); + if (result != 2) { + printf("simdmaxbitsd1_length: unexpected result %u in loop %d\n", result, + i); + return -1; + } + } + printf("simdmaxbitsd1_length: ok\n"); + return 0; +} + +int uint32_cmp(const void *a, const void *b) { + const uint32_t *ia = (const uint32_t *)a; + const uint32_t *ib = (const uint32_t *)b; + if (*ia < *ib) + return -1; + else if (*ia > *ib) + return 1; + return 0; +} + +#ifdef __SSE4_1__ +int test_simdpackedsearch() { + uint32_t buffer[128]; + uint32_t result = 0; + int b, i; + uint32_t init = 0; + __m128i initial = _mm_set1_epi32(init); + printf("[%s]\n", __func__); + /* initialize the buffer */ + for (i = 0; i < 128; i++) + buffer[i] = (uint32_t)(i + 1); + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 1; b <= 32; b++) { + uint32_t out[128]; + /* delta-encode to 'i' bits */ + simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b); + initial = _mm_setzero_si128(); + printf("simdsearchd1: %d bits\n", b); + + /* now perform the searches */ + initial = _mm_set1_epi32(init); + assert(simdsearchd1(&initial, (__m128i *)out, b, 0, &result) == 0); + assert(result > 0); + + for (i = 1; i <= 128; i++) { + initial = _mm_set1_epi32(init); + assert(simdsearchd1(&initial, (__m128i *)out, b, (uint32_t)i, &result) == + i - 1); + assert(result == (unsigned)i); + } + initial = _mm_set1_epi32(init); + assert(simdsearchd1(&initial, (__m128i *)out, b, 200, &result) == 128); + assert(result > 200); + } + printf("simdsearchd1: ok\n"); + return 0; +} + +int test_simdpackedsearchFOR() { + uint32_t buffer[128]; + uint32_t result = 0; + int b; + uint32_t i; + uint32_t maxv, tmin, tmax, tb; + uint32_t out[128]; + printf("[%s]\n", __func__); + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 1; b <= 32; b++) { + /* initialize the buffer */ + maxv = (b == 32) ? 
0xFFFFFFFF : ((1U << b) - 1); + for (i = 0; i < 128; i++) + buffer[i] = maxv * (i + 1) / 128; + simdmaxmin_length(buffer, SIMDBlockSize, &tmin, &tmax); + /* we compute the bit width */ + tb = bits(tmax - tmin); + /* delta-encode to 'i' bits */ + simdpackFOR(tmin, buffer, (__m128i *)out, tb); + printf("simdsearchd1: %d bits\n", b); + + /* now perform the searches */ + for (i = 0; i < 128; i++) { + assert(buffer[i] == simdselectFOR(tmin, (__m128i *)out, tb, i)); + } + for (i = 0; i < 128; i++) { + int x = simdsearchwithlengthFOR(tmin, (__m128i *)out, tb, 128, buffer[i], + &result); + assert(simdselectFOR(tmin, (__m128i *)out, tb, x) == buffer[x]); + assert(simdselectFOR(tmin, (__m128i *)out, tb, x) == result); + assert(buffer[x] == result); + assert(result == buffer[i]); + assert(buffer[x] == buffer[i]); + } + } + printf("simdsearchFOR: ok\n"); + return 0; +} + +int test_simdpackedsearch_advanced() { + uint32_t buffer[128]; + uint32_t backbuffer[128]; + uint32_t out[128]; + uint32_t result = 0; + uint32_t b, i; + uint32_t init = 0; + __m128i initial = _mm_set1_epi32(init); + + printf("[%s]\n", __func__); + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 0; b <= 32; b++) { + uint32_t prev = init; + /* initialize the buffer */ + for (i = 0; i < 128; i++) { + buffer[i] = ((uint32_t)(1431655765 * i + 0xFFFFFFFF)); + if (b < 32) + buffer[i] %= (1U << b); + } + + qsort(buffer, 128, sizeof(uint32_t), uint32_cmp); + + for (i = 0; i < 128; i++) { + buffer[i] = buffer[i] + prev; + prev = buffer[i]; + } + for (i = 1; i < 128; i++) { + if (buffer[i] < buffer[i - 1]) + buffer[i] = buffer[i - 1]; + } + assert(simdmaxbitsd1(init, buffer) <= b); + for (i = 0; i < 128; i++) { + out[i] = 0; /* memset would do too */ + } + + /* delta-encode to 'i' bits */ + simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b); + simdunpackd1(init, (__m128i *)out, backbuffer, b); + + for (i = 0; i < 128; i++) { + assert(buffer[i] == backbuffer[i]); + } + + printf("advanced simdsearchd1: %d bits\n", b); + + for (i = 0; i < 128; i++) { + int pos; + initial = _mm_set1_epi32(init); + pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i], &result); + assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, + buffer[i], &result)); + assert(buffer[pos] == buffer[i]); + if (pos > 0) + assert(buffer[pos - 1] < buffer[i]); + assert(result == buffer[i]); + } + for (i = 0; i < 128; i++) { + int pos; + if (buffer[i] == 0) + continue; + initial = _mm_set1_epi32(init); + pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i] - 1, &result); + assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, + buffer[i] - 1, &result)); + assert(buffer[pos] >= buffer[i] - 1); + if (pos > 0) + assert(buffer[pos - 1] < buffer[i] - 1); + assert(result == buffer[pos]); + } + for (i = 0; i < 128; i++) { + int pos; + if (buffer[i] + 1 == 0) + continue; + initial = _mm_set1_epi32(init); + pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i] + 1, &result); + assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, + buffer[i] + 1, &result)); + if (pos == 128) { + assert(buffer[i] == buffer[127]); + } else { + assert(buffer[pos] >= buffer[i] + 1); + if (pos > 0) + assert(buffer[pos - 1] < buffer[i] + 1); + assert(result == buffer[pos]); + } + } + } + printf("advanced simdsearchd1: ok\n"); + return 0; +} + +int test_simdpackedselect() { + uint32_t buffer[128]; + uint32_t initial = 33; + int b, i; + printf("[%s]\n", __func__); + /* 
initialize the buffer */ + for (i = 0; i < 128; i++) + buffer[i] = (uint32_t)(initial + i); + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 1; b <= 32; b++) { + uint32_t out[128]; + /* delta-encode to 'i' bits */ + simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b); + + printf("simdselectd1: %d bits\n", b); + + /* now perform the searches */ + for (i = 0; i < 128; i++) { + assert(simdselectd1(initial, (__m128i *)out, b, (uint32_t)i) == + initial + i); + } + } + printf("simdselectd1: ok\n"); + return 0; +} + +int test_simdpackedselect_advanced() { + uint32_t buffer[128]; + uint32_t initial = 33; + uint32_t b; + int i; + printf("[%s]\n", __func__); + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 0; b <= 32; b++) { + uint32_t prev = initial; + uint32_t out[128]; + /* initialize the buffer */ + for (i = 0; i < 128; i++) { + buffer[i] = ((uint32_t)(165576 * i)); + if (b < 32) + buffer[i] %= (1U << b); + } + for (i = 0; i < 128; i++) { + buffer[i] = buffer[i] + prev; + prev = buffer[i]; + } + + for (i = 1; i < 128; i++) { + if (buffer[i] < buffer[i - 1]) + buffer[i] = buffer[i - 1]; + } + assert(simdmaxbitsd1(initial, buffer) <= b); + + for (i = 0; i < 128; i++) { + out[i] = 0; /* memset would do too */ + } + + /* delta-encode to 'i' bits */ + simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b); + + printf("simdselectd1: %d bits\n", b); + + /* now perform the searches */ + for (i = 0; i < 128; i++) { + uint32_t valretrieved = + simdselectd1(initial, (__m128i *)out, b, (uint32_t)i); + assert(valretrieved == buffer[i]); + } + } + printf("advanced simdselectd1: ok\n"); + return 0; +} +#endif + +int main() { + int r; + r = issue21(); + if (r) { + printf("test failure issue21\n"); + return r; + } + r = issue21FOR(); + if (r) { + printf("test failure issue21FOR\n"); + return r; + } +#ifdef __AVX512F__ + r = testbabyavx512(); + if (r) { + printf("test failure baby avx512\n"); + return r; + } + + r = testavx512_2(); + if (r) { + printf("test failure 9 avx512\n"); + return r; + } +#endif + + r = testsetFOR(); + if (r) { + printf("test failure 1\n"); + return r; + } + +#ifdef __SSE4_1__ + r = testsetd1(); + if (r) { + printf("test failure 2\n"); + return r; + } +#endif + r = testset(); + if (r) { + printf("test failure 3\n"); + return r; + } + + r = testshortFORpack(); + if (r) { + printf("test failure 4\n"); + return r; + } + r = testshortpack(); + if (r) { + printf("test failure 5\n"); + return r; + } + r = testlongpack(); + if (r) { + printf("test failure 6\n"); + return r; + } +#ifdef __SSE4_1__ + r = test_simdpackedsearchFOR(); + if (r) { + printf("test failure 7\n"); + return r; + } + + r = testFOR(); + if (r) { + printf("test failure 8\n"); + return r; + } +#endif +#ifdef __AVX2__ + r = testbabyavx(); + if (r) { + printf("test failure baby avx\n"); + return r; + } + + r = testavx2(); + if (r) { + printf("test failure 9 avx\n"); + return r; + } +#endif + r = test(); + if (r) { + printf("test failure 9\n"); + return r; + } + + r = test_simdmaxbitsd1_length(); + if (r) { + printf("test failure 10\n"); + return r; + } +#ifdef __SSE4_1__ + r = test_simdpackedsearch(); + if (r) { + printf("test failure 11\n"); + return r; + } + + r = test_simdpackedsearch_advanced(); + if (r) { + printf("test failure 12\n"); + return r; + } + + r = test_simdpackedselect(); + if (r) { + printf("test failure 13\n"); + return r; + } + + r = 
test_simdpackedselect_advanced(); + if (r) { + printf("test failure 14\n"); + return r; + } +#endif + printf("All tests OK!\n"); + + return 0; +} diff --git a/tests/unit_chars.c b/tests/unit_chars.c new file mode 100644 index 0000000..85421f2 --- /dev/null +++ b/tests/unit_chars.c @@ -0,0 +1,97 @@ +/** + * This code is released under a BSD License. + */ +#include "simdcomp.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#define get_random_char() (uint8_t)(rand() % 256); + +int main() { + int N = 5000 * SIMDBlockSize, gap; + __m128i *buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t *datain = malloc(N * sizeof(uint32_t)); + uint32_t *backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + + srand(time(NULL)); + + for (gap = 1; gap <= 387420489; gap *= 3) { + int k; + printf(" gap = %u \n", gap); + + /* simulate some random character string, don't care about endianness */ + for (k = 0; k < N; ++k) { + uint8_t _tmp[4]; + + _tmp[0] = get_random_char(); + _tmp[1] = get_random_char(); + _tmp[2] = get_random_char(); + _tmp[3] = get_random_char(); + + memmove(&datain[k], _tmp, 4); + } + for (k = 0; k * SIMDBlockSize < N; ++k) { + /* + First part works for general arrays (sorted or unsorted) + */ + int j; + /* we compute the bit width */ + const uint32_t b = maxbits(datain + k * SIMDBlockSize); + /* we read 128 integers at "datain + k * SIMDBlockSize" and + write b 128-bit vectors at "buffer" */ + simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); + /* we read back b 128-bit vectors at "buffer" and write 128 integers at + * backbuffer */ + simdunpack(buffer, backbuffer, b); /* uncompressed */ + for (j = 0; j < SIMDBlockSize; ++j) { + uint8_t chars_back[4]; + uint8_t chars_in[4]; + + memmove(chars_back, &backbuffer[j], 4); + memmove(chars_in, &datain[k * SIMDBlockSize + j], 4); + + if (chars_in[0] != chars_back[0] || chars_in[1] != chars_back[1] || + chars_in[2] != chars_back[2] || chars_in[3] != chars_back[3]) { + printf("bug in simdpack\n"); + return -2; + } + } + + { + /* + next part assumes that the data is sorted (uses differential coding) + */ + uint32_t offset = 0; + /* we compute the bit width */ + const uint32_t b1 = simdmaxbitsd1(offset, datain + k * SIMDBlockSize); + /* we read 128 integers at "datain + k * SIMDBlockSize" and + write b1 128-bit vectors at "buffer" */ + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, b1); + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at + * backbuffer */ + simdunpackd1(offset, buffer, backbuffer, b1); + for (j = 0; j < SIMDBlockSize; ++j) { + uint8_t chars_back[4]; + uint8_t chars_in[4]; + + memmove(chars_back, &backbuffer[j], 4); + memmove(chars_in, &datain[k * SIMDBlockSize + j], 4); + + if (chars_in[0] != chars_back[0] || chars_in[1] != chars_back[1] || + chars_in[2] != chars_back[2] || chars_in[3] != chars_back[3]) { + printf("bug in simdpackd1\n"); + return -3; + } + } + offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; + } + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +}
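
For readers of this change, the block-by-block round-trip pattern that unit.c and unit_chars.c exercise can be reduced to a minimal, self-contained sketch. The sketch below relies only on the simdcomp.h entry points already called in the tests above (maxbits, simdpackwithoutmask, simdunpack, simdmaxbitsd1, simdpackwithoutmaskd1, simdunpackd1) and on SIMDBlockSize being 128 integers per block; it is an illustration of the intended usage, not part of this patch or of the test suite.

/* usage sketch (illustration only): pack and unpack one 128-integer block,
   first with plain bit packing, then with differential (d1) coding */
#include "simdcomp.h"
#include <stdio.h>

int main(void) {
  uint32_t datain[SIMDBlockSize];     /* one block of 128 integers */
  uint32_t backbuffer[SIMDBlockSize]; /* decompressed copy */
  __m128i buffer[32];                 /* 32 vectors cover the worst case of 32 bits per value */
  uint32_t b, b1, i;

  for (i = 0; i < SIMDBlockSize; ++i)
    datain[i] = 3 * i; /* sorted input, so the d1 variant applies as well */

  /* general case (sorted or unsorted): measure the bit width, pack, unpack */
  b = maxbits(datain);
  simdpackwithoutmask(datain, buffer, b);
  simdunpack(buffer, backbuffer, b);
  for (i = 0; i < SIMDBlockSize; ++i)
    if (backbuffer[i] != datain[i])
      return 1;

  /* sorted case: differential coding relative to a starting offset of 0 */
  b1 = simdmaxbitsd1(0, datain);
  simdpackwithoutmaskd1(0, datain, buffer, b1);
  simdunpackd1(0, buffer, backbuffer, b1);
  for (i = 0; i < SIMDBlockSize; ++i)
    if (backbuffer[i] != datain[i])
      return 2;

  printf("packed to %u bits per value (%u with deltas); round trips are exact\n", b, b1);
  return 0;
}

Compiled against the library built by the Makefile in this patch (for example cc -O2 -march=native sketch.c -lsimdcomp, assuming the headers and shared library have been installed), the sketch prints the two bit widths and exits with status 0.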