mirror of
https://github.com/danog/libtgvoip.git
synced 2025-01-10 06:38:22 +01:00
765 lines
23 KiB
C++
765 lines
23 KiB
C++
|
/*
|
||
|
Copyright (c) 2015 Christopher A. Taylor. All rights reserved.
|
||
|
|
||
|
Redistribution and use in source and binary forms, with or without
|
||
|
modification, are permitted provided that the following conditions are met:
|
||
|
|
||
|
* Redistributions of source code must retain the above copyright notice,
|
||
|
this list of conditions and the following disclaimer.
|
||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||
|
this list of conditions and the following disclaimer in the documentation
|
||
|
and/or other materials provided with the distribution.
|
||
|
* Neither the name of CM256 nor the names of its contributors may be
|
||
|
used to endorse or promote products derived from this software without
|
||
|
specific prior written permission.
|
||
|
|
||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||
|
POSSIBILITY OF SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
|
||
|
#include "gf256.h"
|
||
|
|
||
|
// Generator polynomial candidates for GF(256), stored in packed form.
// gf255_poly_init() expands the selected entry as (entry << 1) | 1.
const uint8_t gf256_ctx::GF256_GEN_POLY[GF256_GEN_POLY_COUNT] = {
    0x8e, 0x95, 0x96, 0xa6,
    0xaf, 0xb1, 0xb2, 0xb4,
    0xb8, 0xc3, 0xc6, 0xd4,
    0xe1, 0xe7, 0xf3, 0xfa
};
|
||
|
|
||
|
// Constructs a context and immediately fills in its tables.
// The flag starts out false so the gf256_init_() call below does the work;
// the status code returned by gf256_init_() is not inspected here.
gf256_ctx::gf256_ctx()
    : initialized(false)
{
    static_cast<void>(gf256_init_());
}
|
||
|
|
||
|
// The destructor performs no work of its own.
gf256_ctx::~gf256_ctx() {}
|
||
|
|
||
|
// Select which polynomial to use
|
||
|
void gf256_ctx::gf255_poly_init(int polynomialIndex)
|
||
|
{
|
||
|
if (polynomialIndex < 0 || polynomialIndex >= GF256_GEN_POLY_COUNT)
|
||
|
{
|
||
|
polynomialIndex = 0;
|
||
|
}
|
||
|
|
||
|
Polynomial = (GF256_GEN_POLY[polynomialIndex] << 1) | 1;
|
||
|
}
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Exponential and Log Tables
|
||
|
|
||
|
// Construct EXP and LOG tables from polynomial
|
||
|
void gf256_ctx::gf256_explog_init()
|
||
|
{
|
||
|
unsigned poly = Polynomial;
|
||
|
uint8_t* exptab = GF256_EXP_TABLE;
|
||
|
uint16_t* logtab = GF256_LOG_TABLE;
|
||
|
|
||
|
logtab[0] = 512;
|
||
|
exptab[0] = 1;
|
||
|
for (unsigned jj = 1; jj < 255; ++jj)
|
||
|
{
|
||
|
unsigned next = (unsigned)exptab[jj - 1] * 2;
|
||
|
if (next >= 256) next ^= poly;
|
||
|
|
||
|
exptab[jj] = static_cast<uint8_t>( next );
|
||
|
logtab[exptab[jj]] = static_cast<uint16_t>( jj );
|
||
|
}
|
||
|
|
||
|
exptab[255] = exptab[0];
|
||
|
logtab[exptab[255]] = 255;
|
||
|
|
||
|
for (unsigned jj = 256; jj < 2 * 255; ++jj)
|
||
|
{
|
||
|
exptab[jj] = exptab[jj % 255];
|
||
|
}
|
||
|
|
||
|
exptab[2 * 255] = 1;
|
||
|
|
||
|
for (unsigned jj = 2 * 255 + 1; jj < 4 * 255; ++jj)
|
||
|
{
|
||
|
exptab[jj] = 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Multiply and Divide Tables
|
||
|
|
||
|
// Initialize MUL and DIV tables using LOG and EXP tables
|
||
|
void gf256_ctx::gf256_muldiv_init()
|
||
|
{
|
||
|
// Allocate table memory 65KB x 2
|
||
|
uint8_t* m = GF256_MUL_TABLE;
|
||
|
uint8_t* d = GF256_DIV_TABLE;
|
||
|
|
||
|
// Unroll y = 0 subtable
|
||
|
for (int x = 0; x < 256; ++x)
|
||
|
{
|
||
|
m[x] = d[x] = 0;
|
||
|
}
|
||
|
|
||
|
// For each other y value,
|
||
|
for (int y = 1; y < 256; ++y)
|
||
|
{
|
||
|
// Calculate log(y) for mult and 255 - log(y) for div
|
||
|
const uint8_t log_y = static_cast<uint8_t>(GF256_LOG_TABLE[y]);
|
||
|
const uint8_t log_yn = 255 - log_y;
|
||
|
|
||
|
// Next subtable
|
||
|
m += 256;
|
||
|
d += 256;
|
||
|
|
||
|
// Unroll x = 0
|
||
|
m[0] = 0;
|
||
|
d[0] = 0;
|
||
|
|
||
|
// Calculate x * y, x / y
|
||
|
for (int x = 1; x < 256; ++x)
|
||
|
{
|
||
|
uint16_t log_x = GF256_LOG_TABLE[x];
|
||
|
|
||
|
m[x] = GF256_EXP_TABLE[log_x + log_y];
|
||
|
d[x] = GF256_EXP_TABLE[log_x + log_yn];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Inverse Table
|
||
|
|
||
|
// Initialize INV table using DIV table
|
||
|
void gf256_ctx::gf256_inv_init()
|
||
|
{
|
||
|
for (int x = 0; x < 256; ++x)
|
||
|
{
|
||
|
GF256_INV_TABLE[x] = gf256_div(1, static_cast<uint8_t>(x));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Multiply and Add Memory Tables
|
||
|
|
||
|
/*
|
||
|
Fast algorithm to compute m[1..8] = a[1..8] * b in GF(256)
|
||
|
using the SSSE3 SIMD instruction set:
|
||
|
|
||
|
Consider z = x * y in GF(256).
|
||
|
This operation can be performed bit-by-bit. Usefully, the partial product
|
||
|
of each bit is combined linearly with the rest. This means that the 8-bit
|
||
|
number x can be split into its high and low 4 bits, and partial products
|
||
|
can be formed from each half. Then the halves can be linearly combined:
|
||
|
|
||
|
z = x[0..3] * y + x[4..7] * y
|
||
|
|
||
|
The multiplication of each half can be done efficiently via table lookups,
|
||
|
and the addition in GF(256) is XOR. There must be two tables that map 16
|
||
|
input elements for the low or high 4 bits of x to the two partial products.
|
||
|
Each value for y has a different set of two tables:
|
||
|
|
||
|
z = TABLE_LO_y(x[0..3]) xor TABLE_HI_y(x[4..7])
|
||
|
|
||
|
This means that we need 16 * 2 * 256 = 8192 bytes for precomputed tables.
|
||
|
|
||
|
Computing z[] = x[] * y can be performed 16 bytes at a time by using the
|
||
|
128-bit register operations supported by modern processors.
|
||
|
|
||
|
This is efficiently realized in SSSE3 using the _mm_shuffle_epi8() function
|
||
|
provided by Visual Studio 2010 or newer in <tmmintrin.h>. This function
|
||
|
uses the low bits to do a table lookup on each byte. Unfortunately the
|
||
|
high bit of each mask byte has the special feature that it clears the
|
||
|
output byte when it is set, so we need to make sure it's cleared by masking
|
||
|
off the high four bits of each byte before using it:
|
||
|
|
||
|
clr_mask = _mm_set1_epi8(0x0f) = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||
|
|
||
|
For the low half of the partial product, clear the high bit of each byte
|
||
|
and perform the table lookup:
|
||
|
|
||
|
p_lo = _mm_and_si128(x, clr_mask)
|
||
|
p_lo = _mm_shuffle_epi8(TABLE_LO_y, p_lo)
|
||
|
|
||
|
For the high half of the partial product, shift the high 4 bits of each
|
||
|
byte into the low 4 bits and clear the high bit of each byte, and then
|
||
|
perform the table lookup:
|
||
|
|
||
|
p_hi = _mm_srli_epi64(x, 4)
|
||
|
p_hi = _mm_and_si128(p_hi, clr_mask)
|
||
|
p_hi = _mm_shuffle_epi8(TABLE_HI_y, p_hi)
|
||
|
|
||
|
Finally add the two partial products to form the product, recalling that
|
||
|
addition is XOR in a Galois field:
|
||
|
|
||
|
result = _mm_xor_si128(p_lo, p_hi)
|
||
|
|
||
|
This crunches 16 bytes of x at a time, and the result can be stored in z.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
Intrinsic reference:
|
||
|
|
||
|
SSSE3, VS2010+, tmmintrin.h:
|
||
|
|
||
|
GF256_M128 _mm_shuffle_epi8(GF256_M128 a, GF256_M128 mask);
|
||
|
Emits the Supplemental Streaming SIMD Extensions 3 (SSSE3) instruction pshufb. This instruction shuffles 16-byte parameters from a 128-bit parameter.
|
||
|
|
||
|
Pseudo-code for PSHUFB (with 128 bit operands):
|
||
|
|
||
|
for i = 0 to 15 {
|
||
|
if (SRC[(i * 8)+7] = 1 ) then
|
||
|
DEST[(i*8)+7..(i*8)+0] <- 0;
|
||
|
else
|
||
|
index[3..0] <- SRC[(i*8)+3 .. (i*8)+0];
|
||
|
DEST[(i*8)+7..(i*8)+0] <- DEST[(index*8+7)..(index*8+0)];
|
||
|
endif
|
||
|
}
|
||
|
|
||
|
SSE2, VS2008+, emmintrin.h:
|
||
|
|
||
|
GF256_M128 _mm_slli_epi64 (GF256_M128 a, int count);
|
||
|
Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while shifting in zeros.
|
||
|
GF256_M128 _mm_srli_epi64 (GF256_M128 a, int count);
|
||
|
Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros.
|
||
|
GF256_M128 _mm_set1_epi8 (char b);
|
||
|
Sets the 16 signed 8-bit integer values to b.
|
||
|
GF256_M128 _mm_and_si128 (GF256_M128 a, GF256_M128 b);
|
||
|
Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b.
|
||
|
GF256_M128 _mm_xor_si128 ( GF256_M128 a, GF256_M128 b);
|
||
|
Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b.
|
||
|
*/
|
||
|
|
||
|
// Initialize the MM256 tables using gf256_mul()
|
||
|
void gf256_ctx::gf256_muladd_mem_init()
|
||
|
{
|
||
|
for (int y = 0; y < 256; ++y)
|
||
|
{
|
||
|
uint8_t lo[16], hi[16];
|
||
|
|
||
|
// TABLE_LO_Y maps 0..15 to 8-bit partial product based on y.
|
||
|
for (unsigned char x = 0; x < 16; ++x)
|
||
|
{
|
||
|
lo[x] = gf256_mul(x, static_cast<uint8_t>( y ));
|
||
|
hi[x] = gf256_mul(x << 4, static_cast<uint8_t>( y ));
|
||
|
}
|
||
|
|
||
|
const GF256_M128 table_lo = _mm_set_epi8(
|
||
|
lo[15], lo[14], lo[13], lo[12], lo[11], lo[10], lo[9], lo[8],
|
||
|
lo[7], lo[6], lo[5], lo[4], lo[3], lo[2], lo[1], lo[0]);
|
||
|
const GF256_M128 table_hi = _mm_set_epi8(
|
||
|
hi[15], hi[14], hi[13], hi[12], hi[11], hi[10], hi[9], hi[8],
|
||
|
hi[7], hi[6], hi[5], hi[4], hi[3], hi[2], hi[1], hi[0]);
|
||
|
_mm_store_si128(MM256_TABLE_LO_Y + y, table_lo);
|
||
|
_mm_store_si128(MM256_TABLE_HI_Y + y, table_hi);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Initialization
|
||
|
//
|
||
|
// Initialize a context, filling in the tables.
|
||
|
//
|
||
|
// Thread-safety / Usage Notes:
|
||
|
//
|
||
|
// It is perfectly safe and encouraged to use a gf256_ctx object from multiple
|
||
|
// threads. The gf256_init() is relatively expensive and should only be done
|
||
|
// once, though it will take less than a millisecond.
|
||
|
//
|
||
|
// The gf256_ctx object must be aligned to 16 byte boundary.
|
||
|
// Simply tag the object with GF256_ALIGNED to achieve this.
|
||
|
//
|
||
|
// Example:
|
||
|
// static GF256_ALIGNED gf256_ctx TheGF256Context;
|
||
|
// gf256_init(&TheGF256Context, 0);
|
||
|
//
|
||
|
// Returns 0 on success and other values on failure.
|
||
|
|
||
|
int gf256_ctx::gf256_init_()
|
||
|
{
|
||
|
// Avoid multiple initialization
|
||
|
if (initialized)
|
||
|
{
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
if (!IsLittleEndian())
|
||
|
{
|
||
|
fprintf(stderr, "gf256_ctx::gf256_init_: Little Endian architecture expected (code won't work without mods)\n");
|
||
|
return -2;
|
||
|
}
|
||
|
|
||
|
gf255_poly_init(DefaultPolynomialIndex);
|
||
|
gf256_explog_init();
|
||
|
gf256_muldiv_init();
|
||
|
gf256_inv_init();
|
||
|
gf256_muladd_mem_init();
|
||
|
|
||
|
initialized = true;
|
||
|
fprintf(stderr, "gf256_ctx::gf256_init_: initialized\n");
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Operations with context
|
||
|
|
||
|
void gf256_ctx::gf256_mul_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, uint8_t y, int bytes)
|
||
|
{
|
||
|
// Use a single if-statement to handle special cases
|
||
|
if (y <= 1)
|
||
|
{
|
||
|
if (y == 0)
|
||
|
{
|
||
|
memset(vz, 0, bytes);
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// Partial product tables; see above
|
||
|
const GF256_M128 table_lo_y = _mm_load_si128(MM256_TABLE_LO_Y + y);
|
||
|
const GF256_M128 table_hi_y = _mm_load_si128(MM256_TABLE_HI_Y + y);
|
||
|
|
||
|
// clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||
|
const GF256_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||
|
|
||
|
GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast<GF256_M128*>(vz);
|
||
|
const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128*>(vx);
|
||
|
|
||
|
// Handle multiples of 16 bytes
|
||
|
while (bytes >= 16)
|
||
|
{
|
||
|
// See above comments for details
|
||
|
GF256_M128 x0 = _mm_loadu_si128(x16);
|
||
|
GF256_M128 l0 = _mm_and_si128(x0, clr_mask);
|
||
|
x0 = _mm_srli_epi64(x0, 4);
|
||
|
GF256_M128 h0 = _mm_and_si128(x0, clr_mask);
|
||
|
l0 = _mm_shuffle_epi8(table_lo_y, l0);
|
||
|
h0 = _mm_shuffle_epi8(table_hi_y, h0);
|
||
|
_mm_storeu_si128(z16, _mm_xor_si128(l0, h0));
|
||
|
|
||
|
x16++;
|
||
|
z16++;
|
||
|
bytes -= 16;
|
||
|
}
|
||
|
|
||
|
uint8_t * GF256_RESTRICT z8 = reinterpret_cast<uint8_t*>(z16);
|
||
|
const uint8_t * GF256_RESTRICT x8 = reinterpret_cast<const uint8_t*>(x16);
|
||
|
const uint8_t * GF256_RESTRICT table = GF256_MUL_TABLE + ((unsigned)y << 8);
|
||
|
|
||
|
// Handle a block of 8 bytes
|
||
|
if (bytes >= 8)
|
||
|
{
|
||
|
uint64_t word = table[x8[0]];
|
||
|
word |= (uint64_t)table[x8[1]] << 8;
|
||
|
word |= (uint64_t)table[x8[2]] << 16;
|
||
|
word |= (uint64_t)table[x8[3]] << 24;
|
||
|
word |= (uint64_t)table[x8[4]] << 32;
|
||
|
word |= (uint64_t)table[x8[5]] << 40;
|
||
|
word |= (uint64_t)table[x8[6]] << 48;
|
||
|
word |= (uint64_t)table[x8[7]] << 56;
|
||
|
*(uint64_t*)z8 = word;
|
||
|
|
||
|
x8 += 8;
|
||
|
z8 += 8;
|
||
|
bytes -= 8;
|
||
|
}
|
||
|
|
||
|
// Handle a block of 4 bytes
|
||
|
if (bytes >= 4)
|
||
|
{
|
||
|
uint32_t word = table[x8[0]];
|
||
|
word |= (uint32_t)table[x8[1]] << 8;
|
||
|
word |= (uint32_t)table[x8[2]] << 16;
|
||
|
word |= (uint32_t)table[x8[3]] << 24;
|
||
|
*(uint32_t*)z8 = word;
|
||
|
|
||
|
x8 += 4;
|
||
|
z8 += 4;
|
||
|
bytes -= 4;
|
||
|
}
|
||
|
|
||
|
// Handle single bytes
|
||
|
for (int i = bytes; i > 0; i--) {
|
||
|
z8[i-1] = table[x8[i-1]];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void gf256_ctx::gf256_muladd_mem(void * GF256_RESTRICT vz, uint8_t y, const void * GF256_RESTRICT vx, int bytes)
|
||
|
{
|
||
|
// Use a single if-statement to handle special cases
|
||
|
if (y <= 1)
|
||
|
{
|
||
|
if (y == 1)
|
||
|
{
|
||
|
gf256_add_mem(vz, vx, bytes);
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// Partial product tables; see above
|
||
|
const GF256_M128 table_lo_y = _mm_load_si128(MM256_TABLE_LO_Y + y);
|
||
|
const GF256_M128 table_hi_y = _mm_load_si128(MM256_TABLE_HI_Y + y);
|
||
|
|
||
|
// clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||
|
const GF256_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||
|
|
||
|
GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast<GF256_M128*>(vz);
|
||
|
const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128*>(vx);
|
||
|
|
||
|
// Handle multiples of 16 bytes
|
||
|
while (bytes >= 16)
|
||
|
{
|
||
|
// See above comments for details
|
||
|
GF256_M128 x0 = _mm_loadu_si128(x16);
|
||
|
GF256_M128 l0 = _mm_and_si128(x0, clr_mask);
|
||
|
x0 = _mm_srli_epi64(x0, 4);
|
||
|
GF256_M128 h0 = _mm_and_si128(x0, clr_mask);
|
||
|
l0 = _mm_shuffle_epi8(table_lo_y, l0);
|
||
|
h0 = _mm_shuffle_epi8(table_hi_y, h0);
|
||
|
const GF256_M128 p0 = _mm_xor_si128(l0, h0);
|
||
|
const GF256_M128 z0 = _mm_loadu_si128(z16);
|
||
|
_mm_storeu_si128(z16, _mm_xor_si128(p0, z0));
|
||
|
|
||
|
x16++;
|
||
|
z16++;
|
||
|
bytes -= 16;
|
||
|
}
|
||
|
|
||
|
uint8_t * GF256_RESTRICT z8 = reinterpret_cast<uint8_t*>(z16);
|
||
|
const uint8_t * GF256_RESTRICT x8 = reinterpret_cast<const uint8_t*>(x16);
|
||
|
const uint8_t * GF256_RESTRICT table = GF256_MUL_TABLE + ((unsigned)y << 8);
|
||
|
|
||
|
// Handle a block of 8 bytes
|
||
|
if (bytes >= 8)
|
||
|
{
|
||
|
uint64_t word = table[x8[0]];
|
||
|
word |= (uint64_t)table[x8[1]] << 8;
|
||
|
word |= (uint64_t)table[x8[2]] << 16;
|
||
|
word |= (uint64_t)table[x8[3]] << 24;
|
||
|
word |= (uint64_t)table[x8[4]] << 32;
|
||
|
word |= (uint64_t)table[x8[5]] << 40;
|
||
|
word |= (uint64_t)table[x8[6]] << 48;
|
||
|
word |= (uint64_t)table[x8[7]] << 56;
|
||
|
*(uint64_t*)z8 ^= word;
|
||
|
|
||
|
x8 += 8;
|
||
|
z8 += 8;
|
||
|
bytes -= 8;
|
||
|
}
|
||
|
|
||
|
// Handle a block of 4 bytes
|
||
|
if (bytes >= 4)
|
||
|
{
|
||
|
uint32_t word = table[x8[0]];
|
||
|
word |= (uint32_t)table[x8[1]] << 8;
|
||
|
word |= (uint32_t)table[x8[2]] << 16;
|
||
|
word |= (uint32_t)table[x8[3]] << 24;
|
||
|
*(uint32_t*)z8 ^= word;
|
||
|
|
||
|
x8 += 4;
|
||
|
z8 += 4;
|
||
|
bytes -= 4;
|
||
|
}
|
||
|
|
||
|
// Handle single bytes
|
||
|
for (int i = bytes; i > 0; i--) {
|
||
|
z8[i-1] ^= table[x8[i-1]];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Static operations
|
||
|
|
||
|
void gf256_ctx::gf256_add_mem(void * GF256_RESTRICT vx, const void * GF256_RESTRICT vy, int bytes)
|
||
|
{
|
||
|
GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<GF256_M128*>(vx);
|
||
|
const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast<const GF256_M128*>(vy);
|
||
|
|
||
|
// Handle multiples of 64 bytes
|
||
|
while (bytes >= 64)
|
||
|
{
|
||
|
GF256_M128 x0 = _mm_loadu_si128(x16);
|
||
|
GF256_M128 x1 = _mm_loadu_si128(x16 + 1);
|
||
|
GF256_M128 x2 = _mm_loadu_si128(x16 + 2);
|
||
|
GF256_M128 x3 = _mm_loadu_si128(x16 + 3);
|
||
|
GF256_M128 y0 = _mm_loadu_si128(y16);
|
||
|
GF256_M128 y1 = _mm_loadu_si128(y16 + 1);
|
||
|
GF256_M128 y2 = _mm_loadu_si128(y16 + 2);
|
||
|
GF256_M128 y3 = _mm_loadu_si128(y16 + 3);
|
||
|
|
||
|
_mm_storeu_si128(x16,
|
||
|
_mm_xor_si128(x0, y0));
|
||
|
_mm_storeu_si128(x16 + 1,
|
||
|
_mm_xor_si128(x1, y1));
|
||
|
_mm_storeu_si128(x16 + 2,
|
||
|
_mm_xor_si128(x2, y2));
|
||
|
_mm_storeu_si128(x16 + 3,
|
||
|
_mm_xor_si128(x3, y3));
|
||
|
|
||
|
x16 += 4;
|
||
|
y16 += 4;
|
||
|
bytes -= 64;
|
||
|
}
|
||
|
|
||
|
// Handle multiples of 16 bytes
|
||
|
while (bytes >= 16)
|
||
|
{
|
||
|
// x[i] = x[i] xor y[i]
|
||
|
_mm_storeu_si128(x16,
|
||
|
_mm_xor_si128(
|
||
|
_mm_loadu_si128(x16),
|
||
|
_mm_loadu_si128(y16)));
|
||
|
|
||
|
x16++;
|
||
|
y16++;
|
||
|
bytes -= 16;
|
||
|
}
|
||
|
|
||
|
uint8_t * GF256_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
|
||
|
const uint8_t * GF256_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
|
||
|
|
||
|
// Handle a block of 8 bytes
|
||
|
if (bytes >= 8)
|
||
|
{
|
||
|
uint64_t * GF256_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
|
||
|
const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
|
||
|
*x8 ^= *y8;
|
||
|
|
||
|
x1 += 8;
|
||
|
y1 += 8;
|
||
|
bytes -= 8;
|
||
|
}
|
||
|
|
||
|
// Handle a block of 4 bytes
|
||
|
if (bytes >= 4)
|
||
|
{
|
||
|
uint32_t * GF256_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1);
|
||
|
const uint32_t * GF256_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1);
|
||
|
*x4 ^= *y4;
|
||
|
|
||
|
x1 += 4;
|
||
|
y1 += 4;
|
||
|
bytes -= 4;
|
||
|
}
|
||
|
|
||
|
// Handle final bytes
|
||
|
for (int i = bytes; i > 0; i--) {
|
||
|
x1[i-1] ^= y1[i-1];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void gf256_ctx::gf256_add2_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, const void * GF256_RESTRICT vy, int bytes)
|
||
|
{
|
||
|
GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast<GF256_M128*>(vz);
|
||
|
const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128*>(vx);
|
||
|
const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast<const GF256_M128*>(vy);
|
||
|
|
||
|
// Handle multiples of 16 bytes
|
||
|
while (bytes >= 16)
|
||
|
{
|
||
|
// z[i] = x[i] xor y[i]
|
||
|
_mm_storeu_si128(z16,
|
||
|
_mm_xor_si128(
|
||
|
_mm_loadu_si128(z16),
|
||
|
_mm_xor_si128(
|
||
|
_mm_loadu_si128(x16),
|
||
|
_mm_loadu_si128(y16))));
|
||
|
|
||
|
x16++;
|
||
|
y16++;
|
||
|
z16++;
|
||
|
bytes -= 16;
|
||
|
}
|
||
|
|
||
|
uint8_t * GF256_RESTRICT z1 = reinterpret_cast<uint8_t *>(z16);
|
||
|
const uint8_t * GF256_RESTRICT x1 = reinterpret_cast<const uint8_t *>(x16);
|
||
|
const uint8_t * GF256_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
|
||
|
|
||
|
// Handle a block of 8 bytes
|
||
|
if (bytes >= 8)
|
||
|
{
|
||
|
uint64_t * GF256_RESTRICT z8 = reinterpret_cast<uint64_t *>(z1);
|
||
|
const uint64_t * GF256_RESTRICT x8 = reinterpret_cast<const uint64_t *>(x1);
|
||
|
const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
|
||
|
*z8 ^= *x8 ^ *y8;
|
||
|
|
||
|
x1 += 8;
|
||
|
y1 += 8;
|
||
|
z1 += 8;
|
||
|
bytes -= 8;
|
||
|
}
|
||
|
|
||
|
// Handle a block of 4 bytes
|
||
|
if (bytes >= 4)
|
||
|
{
|
||
|
uint32_t * GF256_RESTRICT z4 = reinterpret_cast<uint32_t *>(z1);
|
||
|
const uint32_t * GF256_RESTRICT x4 = reinterpret_cast<const uint32_t *>(x1);
|
||
|
const uint32_t * GF256_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1);
|
||
|
*z4 ^= *x4 ^ *y4;
|
||
|
|
||
|
x1 += 4;
|
||
|
y1 += 4;
|
||
|
z1 += 4;
|
||
|
bytes -= 4;
|
||
|
}
|
||
|
|
||
|
// Handle final bytes
|
||
|
for (int i = bytes; i > 0; i--) {
|
||
|
z1[i-1] ^= x1[i-1] ^ y1[i-1];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void gf256_ctx::gf256_addset_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, const void * GF256_RESTRICT vy, int bytes)
|
||
|
{
|
||
|
GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast<GF256_M128*>(vz);
|
||
|
const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128*>(vx);
|
||
|
const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast<const GF256_M128*>(vy);
|
||
|
|
||
|
// Handle multiples of 64 bytes
|
||
|
while (bytes >= 64)
|
||
|
{
|
||
|
GF256_M128 x0 = _mm_loadu_si128(x16);
|
||
|
GF256_M128 x1 = _mm_loadu_si128(x16 + 1);
|
||
|
GF256_M128 x2 = _mm_loadu_si128(x16 + 2);
|
||
|
GF256_M128 x3 = _mm_loadu_si128(x16 + 3);
|
||
|
GF256_M128 y0 = _mm_loadu_si128(y16);
|
||
|
GF256_M128 y1 = _mm_loadu_si128(y16 + 1);
|
||
|
GF256_M128 y2 = _mm_loadu_si128(y16 + 2);
|
||
|
GF256_M128 y3 = _mm_loadu_si128(y16 + 3);
|
||
|
|
||
|
_mm_storeu_si128(z16, _mm_xor_si128(x0, y0));
|
||
|
_mm_storeu_si128(z16 + 1, _mm_xor_si128(x1, y1));
|
||
|
_mm_storeu_si128(z16 + 2, _mm_xor_si128(x2, y2));
|
||
|
_mm_storeu_si128(z16 + 3, _mm_xor_si128(x3, y3));
|
||
|
|
||
|
x16 += 4;
|
||
|
y16 += 4;
|
||
|
z16 += 4;
|
||
|
bytes -= 64;
|
||
|
}
|
||
|
|
||
|
// Handle multiples of 16 bytes
|
||
|
while (bytes >= 16)
|
||
|
{
|
||
|
// z[i] = x[i] xor y[i]
|
||
|
_mm_storeu_si128(z16,
|
||
|
_mm_xor_si128(
|
||
|
_mm_loadu_si128(x16),
|
||
|
_mm_loadu_si128(y16)));
|
||
|
|
||
|
x16++;
|
||
|
y16++;
|
||
|
z16++;
|
||
|
bytes -= 16;
|
||
|
}
|
||
|
|
||
|
uint8_t * GF256_RESTRICT z1 = reinterpret_cast<uint8_t *>(z16);
|
||
|
const uint8_t * GF256_RESTRICT x1 = reinterpret_cast<const uint8_t *>(x16);
|
||
|
const uint8_t * GF256_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
|
||
|
|
||
|
// Handle a block of 8 bytes
|
||
|
if (bytes >= 8)
|
||
|
{
|
||
|
uint64_t * GF256_RESTRICT z8 = reinterpret_cast<uint64_t *>(z1);
|
||
|
const uint64_t * GF256_RESTRICT x8 = reinterpret_cast<const uint64_t *>(x1);
|
||
|
const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
|
||
|
*z8 = *x8 ^ *y8;
|
||
|
|
||
|
x1 += 8;
|
||
|
y1 += 8;
|
||
|
z1 += 8;
|
||
|
bytes -= 8;
|
||
|
}
|
||
|
|
||
|
// Handle a block of 4 bytes
|
||
|
if (bytes >= 4)
|
||
|
{
|
||
|
uint32_t * GF256_RESTRICT z4 = reinterpret_cast<uint32_t *>(z1);
|
||
|
const uint32_t * GF256_RESTRICT x4 = reinterpret_cast<const uint32_t *>(x1);
|
||
|
const uint32_t * GF256_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1);
|
||
|
*z4 = *x4 ^ *y4;
|
||
|
|
||
|
x1 += 4;
|
||
|
y1 += 4;
|
||
|
z1 += 4;
|
||
|
bytes -= 4;
|
||
|
}
|
||
|
|
||
|
// Handle final bytes
|
||
|
for (int i = bytes; i > 0; i--) {
|
||
|
z1[i-1] = x1[i-1] ^ y1[i-1];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes)
|
||
|
{
|
||
|
GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<GF256_M128*>(vx);
|
||
|
GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast<GF256_M128*>(vy);
|
||
|
|
||
|
// Handle blocks of 16 bytes
|
||
|
while (bytes >= 16)
|
||
|
{
|
||
|
GF256_M128 x0 = _mm_loadu_si128(x16);
|
||
|
GF256_M128 y0 = _mm_loadu_si128(y16);
|
||
|
_mm_storeu_si128(x16, y0);
|
||
|
_mm_storeu_si128(y16, x0);
|
||
|
|
||
|
bytes -= 16;
|
||
|
++x16;
|
||
|
++y16;
|
||
|
}
|
||
|
|
||
|
uint8_t * GF256_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
|
||
|
uint8_t * GF256_RESTRICT y1 = reinterpret_cast<uint8_t *>(y16);
|
||
|
|
||
|
// Handle a block of 8 bytes
|
||
|
if (bytes >= 8)
|
||
|
{
|
||
|
uint64_t * GF256_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
|
||
|
uint64_t * GF256_RESTRICT y8 = reinterpret_cast<uint64_t *>(y1);
|
||
|
|
||
|
uint64_t temp = *x8;
|
||
|
*x8 = *y8;
|
||
|
*y8 = temp;
|
||
|
|
||
|
x1 += 8;
|
||
|
y1 += 8;
|
||
|
bytes -= 8;
|
||
|
}
|
||
|
|
||
|
// Handle a block of 4 bytes
|
||
|
if (bytes >= 4)
|
||
|
{
|
||
|
uint32_t * GF256_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1);
|
||
|
uint32_t * GF256_RESTRICT y4 = reinterpret_cast<uint32_t *>(y1);
|
||
|
|
||
|
uint32_t temp = *x4;
|
||
|
*x4 = *y4;
|
||
|
*y4 = temp;
|
||
|
|
||
|
x1 += 4;
|
||
|
y1 += 4;
|
||
|
bytes -= 4;
|
||
|
}
|
||
|
|
||
|
// Handle final bytes
|
||
|
uint8_t temp;
|
||
|
|
||
|
for (int i = bytes; i > 0; i--) {
|
||
|
temp = x1[i-1]; x1[i-1] = y1[i-1]; y1[i-1] = temp;
|
||
|
}
|
||
|
}
|