/*
ms.cu
Counting number of magic squares up to rotations,
reflections and M transformations.
Simultaneously counting semi-magic squares.
Compilation :
nvcc -O3 -arch=sm_60 -maxrregcount=40 \
-Wno-deprecated-declarations ms.cu -lcrypto
To omit md5 checksum, add a -DnoMD5 option. Then you may drop
-Wno-deprecated-declarations and -lcrypto.
-DV option produces verbose outputs for N=6.
Command line options:
% ./a.out <1stRowNo> <1stRowPattern> <2ndRowNo> <2ndRowPattern>
example : ./a.out 00000 e00000007 13745 180800250
See also : https://magicsquare6.net/
Updated on 2023/09/20.
Authored by Hidetoshi Mino hidetoshimino@gmail.com .
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <assert.h>
#ifndef noMD5
#include <openssl/md5.h>
#endif
#include <vector>
using namespace std;
#ifndef N // order of square : 3..6
#define N 6
#endif
#ifndef NB_SM // Number of blocks per SM
#define NB_SM 16
#endif
#ifndef NW // Number of Warps per block
#define NW 3
#endif
#ifndef NSMPL // Sampling stride over row candidates in MakeRows() ( 1 = exhaustive )
#define NSMPL 1
#endif
#ifndef MDIAG // Largest diagonal-list length served by the precomputed pair table
#define MDIAG 24
#endif
#ifndef CINTLIM // Bit-width sanity limit asserted on per-rowset semi-magic counts
#define CINTLIM 26
#endif
// Bit vectors : one bit per value 1..NN ( bit k = value k+1 ); a row,
// column or diagonal is a set of N values summing to the magic constant.
typedef long long unsigned u64;
#define NN (N*N) /* number of cells / values */
#define AllBits ( ( 1ULL << NN ) - 1 ) /* mask of all NN values */
#define AllLanes (0xffffffff) /* full-warp mask for *_sync intrinsics */
#define MAX( x, y ) ( (x) > (y) ? (x) : (y) )
// Abort with file/line information on any failing CUDA runtime call.
#define CU_SAFE(stmt) \
do { \
cudaError_t status = stmt; \
if (status != cudaSuccess) { \
printf("ERROR: %s in %s:%d.\n", cudaGetErrorString(status), \
__FILE__, __LINE__); \
exit( 1 ); \
} \
} while (0)
#define WS 32 /* Warp size */
#define RUWS( x ) ( ( ( x ) + ( WS - 1 ) ) & ~( WS - 1 ) ) /* Round up to WS */
#define RDWS( x ) ( ( x ) & ~( WS - 1 ) ) /* Round down to WS */
/*
 * topbit : 0-based index of the most significant set bit of v.
 * Device code uses __clzll. The host path first clears every bit whose
 * next-higher bit is set ( v & ~( v >> 1 ) ), so the surviving value is
 * strictly below 1.5 * 2^p ( p = top bit position ); rounding in the
 * float conversion therefore cannot carry past 2^(p+1), and the float
 * exponent field is exactly p + 127.
 * Result is undefined for v == 0. Verified at startup by check_topbit().
 */
__device__ __host__ int topbit( u64 v ) {
#ifdef __CUDA_ARCH__
return 63 - __clzll( v );
#else
v = v & ( ~( v >> 1 ) );
float f32 = (float) v;
int pos;
/* Bit-cast via memcpy; the original *(int*)&f32 violated strict aliasing. */
memcpy( &pos, &f32, sizeof( pos ) );
pos =( pos >> 23 ) - 0x7f;
return pos;
#endif
/* Portable reference implementation, kept for documentation :
int pos = 0;
int v32 = v >> 32;
if( v32 ) {
pos = 32;
} else {
v32 = (int) v;
}
if( v32 & 0xffff0000UL ) { pos +=16; v32 >>= 16; }
if( v32 & 0x0000ff00UL ) { pos += 8; v32 >>= 8; }
if( v32 & 0x000000f0UL ) { pos += 4; v32 >>= 4; }
if( v32 & 0x0000000cUL ) { pos += 2; v32 >>= 2; }
if( v32 & 0000000002UL ) pos++;
return pos;
*/
}
/*
 * reverse64 : full 64-bit bitwise reversal ( bit 0 <-> bit 63 ).
 * Device code maps to the single __brevll instruction; the host path
 * swaps progressively larger fields ( 1, 2, 4, 8, 16, then 32 bits ).
 */
__device__ __host__ u64 reverse64( u64 x ) {
#ifdef __CUDA_ARCH__
return __brevll( x );
#else
x = ( ( x >> 1 ) & 0x5555555555555555ULL ) | ( ( x & 0x5555555555555555ULL ) << 1 );
x = ( ( x >> 2 ) & 0x3333333333333333ULL ) | ( ( x & 0x3333333333333333ULL ) << 2 );
x = ( ( x >> 4 ) & 0x0F0F0F0F0F0F0F0FULL ) | ( ( x & 0x0F0F0F0F0F0F0F0FULL ) << 4 );
x = ( ( x >> 8 ) & 0x00FF00FF00FF00FFULL ) | ( ( x & 0x00FF00FF00FF00FFULL ) << 8 );
x = ( ( x >> 16 ) & 0x0000FFFF0000FFFFULL ) | ( ( x & 0x0000FFFF0000FFFFULL ) << 16 );
x = ( x >> 32 ) | ( x << 32 );
return x;
#endif
}
/* Startup self-test for the host topbit() : single-bit values 2^i must
 give i, and all-ones values ( 2^i - 1 ) must give i - 1.
 Returns 0 on success, 1 on the first failure. */
int check_topbit( ) {
int i = 0;
while( i < NN ) {
if( topbit( 1ULL << i ) != i ) return 1;
i ++;
}
for( i = 1; i < NN; i ++ ) {
u64 ones = ( 1ULL << i ) - 1;
if( topbit( ones ) != i - 1 ) return 1;
}
fprintf( stderr, "topbit() works correctly.\n" );
return 0;
}
/* Stride ( in u64 elements ) of one warp's column scratch area :
 ( order - 2 ) warp-rounded segments of ncols entries, plus one
 trailing warp used as an overrun guard ( checked in RowSet ). */
__device__ __host__ int vcsize_f( int ncols, int order )
{
int segment = RUWS( ncols );
return segment * ( order - 2 ) + WS;
}
/* Stride ( in u64 elements ) of one warp's diagonal scratch area :
 ( order - 1 ) warp-rounded segments of ndiags entries, plus one
 trailing guard warp ( checked in RowSet ). */
__device__ __host__ int vxsize_f( int ndiags, int order )
{
int segment = RUWS( ndiags );
return segment * ( order - 1 ) + WS;
}
/* = Host globals ======================= */
#define MVECS 32768 /* capacity of vecs[] */
int nvecs = 0; /* number of generated candidate vectors */
/* All N-value subsets of 1..NN summing to the magic constant, encoded
 as bit patterns and generated by GenVecs() in decreasing order. */
u64 vecs[ MVECS ];
/* = Constant memory ==================== */
/* Per-(row1,row2) search constants, uploaded by Count() before launch. */
__constant__ u64 v_max_c; /* fixed top row pattern */
__constant__ u64 rev_max_c; /* Reverse( v_max_c ) */
__constant__ u64 v_second_c; /* fixed second row pattern */
__constant__ int single_c; /* rev_max_c already equals a fixed row */
__constant__ int nrowset_c; /* u64 entries in row_set ( multiple of N-2 ) */
__constant__ int ncols_c, ndiags_c; /* candidate counts */
__constant__ int vcsize_c, vxsize_c; /* per-warp scratch strides ( vcsize_f / vxsize_f ) */
#define MDPAIR ( MDIAG * MDIAG * MDIAG / 5 + ( WS - 1 ) * MDIAG )
/* Precomputed ( i, j ) index-pair table consumed by MakeDiagC() :
 p_idx_c[] packs ( i << 16 ) | j, p_offset_c[ n ] is the slice start
 for a diagonal list of length n. Built in main(). */
__constant__ int p_offset_c[ MDIAG + 2 ];
__constant__ int p_idx_c[ MDPAIR ];
/* = Block __shared__ ==================== */
/* Per-warp ( threadIdx.y ) state, indexed by clz-style value position. */
//__shared__ u64 which_rowv[ NW ][ NN ];
__shared__ int which_row[ NW ][ NN ]; /* value -> row level containing it */
__shared__ u64 which_colv[ NW ][ NN ]; /* value -> column vector containing it */
// __shared__ int which_col[ NW ][ NN ];
__shared__ u64 row_vecs[ NW ][ N ]; /* chosen row vector per level */
__shared__ u64 col_vecs[ NW ][ N ]; /* chosen column vector per level */
/* = Managed ( Unified ) =================== */
__managed__ int i_rowset_m = 0; /* work-queue cursor shared by all devices */
__managed__ u64 count_m = 0ULL; /* magic-square total */
__managed__ u64 sm_count_m = 0ULL; /* semi-magic total */
/* ========================================= */
/* ---- forward declarations ---- */
void Count( int max_row, int second_row, const vector<int> & dev_blocks );
void MakeRows( int m, int l, u64 rem, u64* vrb, u64* vre, vector<u64> & res );
__global__ void RowSet( u64* row_set_d, u64* vc_g, u64* vx_g );
__device__ u64 MakeCols( int level, u64 rem,
u64* const & vcb, const int & nvc,
u64* const & vxb, const int & nvx, int single, int & sm_count );
__device__ void LastCols( u64 rem,
u64* const & vcb, const int & nvn, const int & nvcmn,
u64* const & vxb, const int & nvx, int single,
u64 & sub_count, int & sm_count );
__device__ int MakeDiagC( u64* const & vxb, const int & nvx );
__device__ int MakeDiagS( u64* const & vxb, const int & nvx );
__device__ int FinalCheck(
int* const &e2rowv, u64* const &e2col, const u64 & vdx, const u64 & vcx );
/* True iff the vectors at indexes v1 and v2 share exactly one value,
 i.e. their intersection is a nonzero power of two. */
int Cross( int v1, int v2 ) {
u64 common = vecs[ v1 ] & vecs[ v2 ];
if( common == 0ULL ) return 0;
return ( common & ( common - 1 ) ) == 0ULL;
}
/* Reverse the NN-bit pattern ( value 1 <-> value NN ). */
__device__ __host__ u64 Reverse( u64 v )
{
u64 r = reverse64( v );
return r >> ( 64 - NN );
}
/*
 * GenVecs : enumerate all ways to pick level + 1 further distinct
 * entries from 1..max_ent summing to rem_sum, accumulating the chosen
 * entries as bits in vp; each completed pattern is appended to the
 * global vecs[] / nvecs. Entries are chosen in decreasing order, so
 * vecs[] ends up sorted in decreasing bit-pattern order ( main() and
 * MakeRows() rely on this ).
 * Fixed: the capacity check now runs BEFORE the store — the original
 * stored first and could write one element past the end of vecs[].
 */
void GenVecs( int max_ent, int rem_sum, int level, u64 vp )
{
if ( level >= 0 ) {
for ( int ent = max_ent; ent > 0 ; --ent ) {
u64 v = 1ULL << ( ent - 1 );
if ( rem_sum >= ent )
GenVecs( ent - 1, rem_sum - ent, level - 1, vp | v );
}
}
else if ( rem_sum == 0 ) {
if( nvecs >= MVECS ) {
fprintf( stderr, "nvecs >= MVECS\n" );
exit( 1 );
}
vecs[ nvecs ++ ] = vp;
}
}
/*
 * Count : count magic / semi-magic squares whose two top rows are
 * vecs[ max_row ] and vecs[ second_row ], distributing the work over
 * all visible devices. Results are accumulated into the managed
 * counters count_m / sm_count_m by the RowSet kernel.
 */
void Count( int max_row, int second_row, const vector<int> & dev_blocks )
{
u64 v_max = vecs[ max_row ];
u64 v_second = vecs[ second_row ];
// Values unused by the two fixed rows; the largest of them must appear
// in the next row chosen, which prunes the enumeration ( n3rd below ).
u64 rem = AllBits ^ v_max ^ v_second;
u64 must_bit = ( 1ULL << topbit( rem ) );
vector<u64> rows; rows.reserve( nvecs );
vector<u64> cols; cols.reserve( nvecs );
vector<u64> diags; diags.reserve( nvecs );
int n3rd = 0;
// Row candidates : disjoint from both fixed rows, pruned by the
// canonical-form condition v_max >= Reverse( candidate ).
for( int i = second_row + 1; i < nvecs; ++ i )
if( ( ! ( v_max & vecs[ i ] ) ) && ( ! ( v_second & vecs[ i ] ) )
&& v_max >= Reverse( vecs[ i ] ) )
{
rows.push_back( vecs[ i ] );
if( vecs[ i ] & must_bit ) n3rd ++;
}
int nrows = rows.size();
vector<u64> row_set; row_set.reserve( n3rd * 3000 * ( N - 2 ) );
// NOTE: variable-length array ( GNU extension ); MakeRows uses the
// tail of this buffer as compaction scratch.
u64 vr[ nrows * ( N - 2 ) ];
for( int i = 0; i < nrows; i ++ ) vr[ i ] = rows[ i ];
fprintf( stderr, "nrows %d n3rd %d \n", nrows, n3rd );
MakeRows( N - 2, 0, rem, vr, vr + nrows, row_set );
fprintf( stderr, "row_set.size() %lu \n", row_set.size() );
// Column candidates : cross each fixed row in exactly one value, with
// the same reflection pruning as the rows.
for( int i = max_row + 1; i < nvecs; ++ i )
if( Cross( max_row, i ) && Cross( second_row, i )
&& v_max >= Reverse( vecs[ i ] ) )
{
cols.push_back( vecs[ i ] );
}
int ncols = cols.size();
// Diagonal candidates : same crossing condition, no pruning.
for( int i = 0; i < nvecs; ++ i )
if( Cross( max_row, i ) && Cross( second_row, i ) )
diags.push_back( vecs[ i ] );
int ndiags = diags.size();
/* constant parameters for the __global__ RowSet() */
u64 rev_max = Reverse( v_max );
int single = rev_max == v_max || rev_max == v_second;
int nrowset = row_set.size();
int vcsize = vcsize_f( ncols, N );
int vxsize = vxsize_f( ndiags, N );
// Device Memory Allocation -------------------
int ndev = dev_blocks.size();
u64* row_set_d[ ndev ];
u64* vc_d[ ndev ];
u64* vx_d[ ndev ];
i_rowset_m = 0; // initialize managed variable
// count_m = 0ULL; // Don't initialize here.
for( int idev = 0 ; idev < ndev; idev ++ ) {
CU_SAFE( cudaSetDevice( idev ) );
int nblocks = dev_blocks[ idev ];
CU_SAFE( cudaMalloc( row_set_d + idev, sizeof( u64 ) * row_set.size() ) );
CU_SAFE( cudaMemcpy( row_set_d[ idev ], row_set.data(),
sizeof( u64 ) * row_set.size(), cudaMemcpyHostToDevice ) );
// Candidate lists at the head, per-warp scratch regions behind them.
CU_SAFE( cudaMalloc( vc_d + idev,
sizeof(u64) * ( RUWS( ncols ) + vcsize * nblocks * NW ) ) );
CU_SAFE( cudaMalloc( vx_d + idev,
sizeof(u64) * ( RUWS( ndiags ) + vxsize * nblocks * NW ) ) );
CU_SAFE( cudaMemcpy( vc_d[ idev ], cols.data(), ncols*sizeof(u64),
cudaMemcpyHostToDevice ) );
CU_SAFE( cudaMemcpy( vx_d[ idev ], diags.data(), ndiags*sizeof(u64),
cudaMemcpyHostToDevice ) );
// constant memory setup --------------------
CU_SAFE( cudaMemcpyToSymbol( v_max_c, &v_max, sizeof( u64 ) ) );
CU_SAFE( cudaMemcpyToSymbol( v_second_c, &v_second, sizeof( u64 ) ) );
CU_SAFE( cudaMemcpyToSymbol( rev_max_c, &rev_max, sizeof( u64 ) ) );
CU_SAFE( cudaMemcpyToSymbol( single_c, &single, sizeof( int ) ) );
CU_SAFE( cudaMemcpyToSymbol( nrowset_c, &nrowset, sizeof( int ) ) );
CU_SAFE( cudaMemcpyToSymbol( ncols_c, &ncols, sizeof( int ) ) );
CU_SAFE( cudaMemcpyToSymbol( ndiags_c, &ndiags, sizeof( int ) ) );
CU_SAFE( cudaMemcpyToSymbol( vcsize_c, &vcsize, sizeof( int ) ) );
CU_SAFE( cudaMemcpyToSymbol( vxsize_c, &vxsize, sizeof( int ) ) );
// ------------------------------------------
// Asynchronous launch; all devices pull work from i_rowset_m.
RowSet <<< nblocks, dim3( WS, NW ) >>> (
row_set_d[ idev ], vc_d[ idev ], vx_d[ idev ] );
}
for( int idev = 0 ; idev < ndev; idev ++ ) {
CU_SAFE( cudaSetDevice( idev ) );
// cudaDeviceSynchronize();
// Device Memory Deallocation ----------------
// NOTE(review): no explicit synchronize — this presumably relies on
// cudaFree blocking until the device's pending kernel completes;
// confirm against the CUDA runtime documentation.
CU_SAFE( cudaFree( vx_d[ idev ] ) );
CU_SAFE( cudaFree( vc_d[ idev ] ) );
CU_SAFE( cudaFree( row_set_d[ idev ] ) );
}
return;
}
/*
 * MakeRows : recursively enumerate ordered choices for rows l .. m-1
 * from the candidate range [ vrb, vre ) and append each completed
 * group of m rows, flattened, to res.
 * rem is the set of still-unused values; a chosen row must contain the
 * largest remaining value ( must_bit ). Candidates are subsets of rem
 * sorted in decreasing order, so those containing must_bit form a
 * prefix and the loop can stop at the first candidate lacking it.
 * Candidates disjoint from the chosen row are compacted into the space
 * just past vre, so the caller's buffer must be nrows * ( N - 2 )
 * elements ( see Count() ). NSMPL > 1 samples every NSMPL-th candidate.
 * NOTE: the static row_vecs[] makes this non-reentrant — host-side,
 * single-threaded use only.
 */
void MakeRows( int m, int l, u64 rem, u64* vrb, u64* vre, vector<u64> & res )
{
static u64 row_vecs[ N ];
u64 must_bit = ( 1ULL << topbit( rem ) );
for( ; vrb < vre; vrb += NSMPL ) {
u64 row_vec = *vrb;
if( ! ( row_vec & must_bit ) ) break;
row_vecs[ l ] = row_vec;
if( m > l + 1 ) {
// Compact the candidates disjoint from row_vec into [ vre, vrq ).
u64* vrp = vrb + 1;
u64* vrq = vre;
while( vrp < vre ) {
*vrq = *vrp;
vrq += ( row_vec & *vrp ) ? 0 : 1;
vrp ++;
}
MakeRows( m, l + 1, rem ^ row_vec, vre, vrq, res );
} else {
// Deepest level reached : emit the m accumulated rows.
for( int i = 0; i < m; i ++ ) {
res.push_back( row_vecs[ i ] );
}
}
}
}
/*
 * gather_para : warp-cooperative compaction. Copies every element of
 * vp[ bv .. ev ) that is disjoint from x ( v & x == 0 ) into vpn,
 * preserving order, and returns the number kept. All 32 lanes
 * cooperate; loads start at RDWS( bv ) to stay warp-aligned, and lanes
 * outside [ bv, ev ) use v = -1, which never passes the test
 * ( assumes x != 0 ).
 */
__device__ int gather_para( u64* const & vp, const int & bv, const int & ev,
const u64 & x, u64* const & vpn )
{
int j = 0;
for( int i = RDWS( bv ); i < ev; i += WS ) {
int i_t = i + threadIdx.x;
u64 v = -1ULL;
if( i_t >=bv and i_t < ev ) v = vp[ i_t ];
int p = ! ( v & x );
// Exclusive prefix count of accepting lanes below this one.
// NOTE(review): lane 0 shifts a 32-bit value by 32 — nvcc compiles
// this to a PTX shl yielding 0, but it is formally UB in C++.
unsigned ballot = __ballot_sync( AllLanes, p );
unsigned sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
if( p )
vpn[ j + sub_sum ] = v;
// Lane WS-1 holds the inclusive total; broadcast it to advance j.
j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
}
return j;
}
/*
 * cnext : given vp[ 0 .. nv ) sorted in decreasing order, return the
 * length of the leading run of vectors containing a value at position
 * >= must_bit ( v >> must_bit != 0 ). Processed one warp-chunk at a
 * time; lanes past nv load 0 and never qualify, and the loop stops at
 * the first chunk that is not fully qualifying.
 */
__device__ int cnext( u64* vp, int nv, int must_bit )
{
int j = 0;
for( int i = 0; i < nv; i += WS ) {
u64 v = 0ULL;
if( i + threadIdx.x < nv ) v = vp[ i + threadIdx.x ];
j += __popc( __ballot_sync( AllLanes, v >> must_bit ) );
if( j != i + WS ) break;
}
return j;
}
/*
 * gather_cross : warp-cooperative compaction keeping the elements of
 * vp[ 0 .. nv ) that share exactly one value with x
 * ( __popcll( v & x ) == 1 ). Compacted into vpn, count returned.
 * Same warp prefix-sum scheme as gather_para().
 */
__device__ int gather_cross(
u64* const & vp, const int & nv, const u64 & x, u64* const & vpn )
{
int j = 0;
for( int i = 0; i < nv; i += WS ) {
int i_t = i + threadIdx.x;
u64 v = 0ULL;
if( i_t < nv ) v = vp[ i_t ];
int p = __popcll( v & x ) == 1;
unsigned ballot = __ballot_sync( AllLanes, p );
unsigned sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
if( p )
vpn[ j + sub_sum ] = v;
j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
}
return j;
}
/*
 * gather_double_cross : like gather_cross(), but keeps elements that
 * share exactly one value with each of x1 and x2.
 */
__device__ int gather_double_cross( u64* const & vp, const int & nv,
const u64 & x1, const u64 & x2, u64* const & vpn )
{
int j = 0;
for( int i = 0; i < nv; i += WS ) {
int i_t = i + threadIdx.x;
u64 v = 0ULL;
if( i_t < nv ) v = vp[ i_t ];
int p = __popcll( v & x1 ) == 1 and __popcll( v & x2 ) == 1;
unsigned ballot = __ballot_sync( AllLanes, p );
unsigned sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
if( p )
vpn[ j + sub_sum ] = v;
j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
}
return j;
}
/*
 * gather_triple_cross : like gather_cross(), but keeps elements that
 * share exactly one value with each of x1, x2 and x3.
 */
__device__ int gather_triple_cross( u64* const & vp, const int & nv,
const u64 & x1, const u64 & x2, const u64 & x3, u64* const & vpn )
{
int j = 0;
for( int i = 0; i < nv; i += WS ) {
int i_t = i + threadIdx.x;
u64 v = 0ULL;
if( i_t < nv ) v = vp[ i_t ];
int p = __popcll( v & x1 ) == 1
and __popcll( v & x2 ) == 1
and __popcll( v & x3 ) == 1;
unsigned ballot = __ballot_sync( AllLanes, p );
unsigned sub_sum = __popc( ballot << ( WS - threadIdx.x ) );
if( p )
vpn[ j + sub_sum ] = v;
j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
}
return j;
}
/*
 * set_vec : record vector v at vecs[ level ] and index it by value :
 * for every set bit b of v, index[ NN-1 - b ] = level. The key
 * NN-1 - b equals __clzll( 1ULL << b ) + NN - 64, the form used by
 * FinalCheck(). Each lane handles bit threadIdx.x and bit
 * threadIdx.x + WS. Warp-synchronous : all 32 lanes must call this.
 */
__device__ void set_vec( u64 vecs[ ], int index[ ],
const u64 & v, const int & level )
{
vecs[ level ] = v;
if( ( v >> threadIdx.x ) & 1 ) {
index[ NN-1 - threadIdx.x ] = level;
}
// ( v >> WS ) >> threadIdx.x avoids a single out-of-range shift.
if( ( ( v >> WS ) >> threadIdx.x ) & 1 ) {
index[ NN-1 - WS - threadIdx.x ] = level;
}
__syncwarp();
}
/*
 * set_vecv : like set_vec(), but the per-value index stores the vector
 * itself rather than its level ( value -> containing column vector ).
 */
__device__ void set_vecv( u64 vecs[ ], u64 indexv[ ],
const u64 & v, const int & level )
{
vecs[ level ] = v;
if( ( v >> threadIdx.x ) & 1 ) {
indexv[ NN-1 - threadIdx.x ] = v;
}
if( ( ( v >> WS ) >> threadIdx.x ) & 1 ) {
indexv[ NN-1 - WS - threadIdx.x ] = v;
}
__syncwarp();
}
/*
 * RowSet : top-level kernel, launched <<< nblocks, dim3( WS, NW ) >>>.
 * Each warp ( threadIdx.y slot of each block ) repeatedly claims one
 * row-set work item ( N-2 rows stored consecutively in rowset[] ) from
 * the managed cursor i_rowset_m, then completes the square : it
 * filters the column / diagonal candidates against the claimed rows
 * into its private scratch region inside vc_g / vx_g, annotates each
 * surviving diagonal with the ( clz-style ) positions of the values it
 * shares with rows 0..2 ( bytes packed above bit 40, consumed by
 * FinalCheck() ), and runs the column search ( MakeCols / LastCols ).
 * Per-lane magic counts are warp-reduced and added to the managed
 * totals; the semi-magic count is warp-uniform ( it is accumulated
 * under uniform shared-memory control flow in LastCols ), so only
 * lane 0's copy is added.
 * Fix : the N == 3 branch passed an undeclared `nv` to LastCols();
 * it now passes `nvn` ( the branch is compiled only when N == 3 ).
 */
__global__ void RowSet( u64* rowset, u64* vc_g, u64* vx_g )
{
#define WHICH_ROW which_row[ threadIdx.y ]
#define WHICH_ROWV which_rowv[ threadIdx.y ]
#define ROW_VECS row_vecs[ threadIdx.y ]
// Rows N-1 and N-2 are the fixed rows common to every work item.
set_vec( ROW_VECS, WHICH_ROW, v_max_c, N - 1 );
set_vec( ROW_VECS, WHICH_ROW, v_second_c, N - 2 );
__shared__ u64* vc_s[ NW ];
__shared__ u64* vx_s[ NW ];
#define VC vc_s[ threadIdx.y ]
#define VX vx_s[ threadIdx.y ]
// This warp's private scratch, placed after the shared candidate
// arrays at the head of vc_g / vx_g.
VC = vc_g + RUWS( ncols_c ) + vcsize_c * ( blockIdx.x * NW + threadIdx.y );
VX = vx_g + RUWS( ndiags_c ) + vxsize_c * ( blockIdx.x * NW + threadIdx.y );
// Guard words, re-checked after the loop to detect scratch overrun.
VC[ vcsize_c - WS ] = 0ULL;
VX[ vxsize_c - WS ] = 0ULL;
u64 count = 0ULL;
u64 sm_count = 0ULL;
for( ; /* i_rowset < nrowset_c */; /* i_rowset += N - 2 */ ) {
__shared__ int i_rowset[ NW ];
#define I_ROWSET i_rowset[ threadIdx.y ]
// Lane 0 claims the next work item from the cross-device cursor.
if( threadIdx.x == 0 ) {
I_ROWSET = atomicAdd_system( &i_rowset_m, N - 2 );
}
__syncwarp();
//i_rowset = __shfl_sync( AllLanes, i_rowset, 0 );
if( I_ROWSET >= nrowset_c ) break;
__shared__ int single[ NW ];
#define SINGLE single[ threadIdx.y ]
SINGLE = single_c;
for( int i = 0; i < N - 2; i ++ ) {
u64 row_vec = rowset[ I_ROWSET + i ];
set_vec( ROW_VECS, WHICH_ROW, row_vec, N - 3 - i );
SINGLE = SINGLE or rev_max_c == row_vec;
}
__shared__ int kvc_s[ NW ];
__shared__ int kvx_s[ NW ];
#define KVC kvc_s[ threadIdx.y ]
#define KVX kvx_s[ threadIdx.y ]
// Keep only candidates that meet every non-fixed row exactly once.
#if N == 6
KVC = gather_triple_cross( vc_g, ncols_c,
ROW_VECS[ N - 3 ], ROW_VECS[ N - 4 ], ROW_VECS[ N - 5 ], VC );
KVX = gather_triple_cross( vx_g, ndiags_c,
ROW_VECS[ N - 3 ], ROW_VECS[ N - 4 ], ROW_VECS[ N - 5 ], VX );
#endif
#if N == 5
KVC = gather_double_cross( vc_g, ncols_c,
ROW_VECS[ N - 3 ], ROW_VECS[ N - 4 ], VC );
KVX = gather_double_cross( vx_g, ndiags_c,
ROW_VECS[ N - 3 ], ROW_VECS[ N - 4 ], VX );
#endif
#if N == 4
KVC = gather_cross( vc_g, ncols_c, ROW_VECS[ N - 3 ], VC );
KVX = gather_cross( vx_g, ndiags_c, ROW_VECS[ N - 3 ], VX );
#endif
#if N == 3
for( int i = 0; i < ncols_c; i++ ) VC[ i ] = vc_g[ i ];
for( int i = 0; i < ndiags_c; i++ ) VX[ i ] = vx_g[ i ];
KVC = ncols_c;
KVX = ndiags_c;
#endif
// Annotate each diagonal with the clz-style positions of the values
// it shares with rows 0, 1 and 2, packed in the bytes above bit 40.
for( int iw = 0; iw < KVX; iw += WS ) {
if( iw + threadIdx.x < KVX ) {
u64 v = VX[ iw + threadIdx.x ];
VX[ iw + threadIdx.x ] |=
u64 ( ( __clzll ( v & ROW_VECS[ 0 ] ) + NN - 64 )
| ( ( __clzll ( v & ROW_VECS[ 1 ] ) + NN - 64 ) << 8 )
| ( ( __clzll ( v & ROW_VECS[ 2 ] ) + NN - 64 ) << 16 )
) << 40;
}
}
// if( KVC < N or KVX < 2 ) continue;
#if N > 3
// 53 / 51 ?
int sm_count_cols = 0;
count +=
MakeCols( N - 4, AllBits, VC, KVC, VX, KVX, SINGLE, sm_count_cols ) ;
assert( ( sm_count_cols >> CINTLIM ) == 0 );
sm_count += sm_count_cols;
/*
for( int span = ( WS >> 1 ); span > 0; span >>= 1 ) {
count_cols += __shfl_xor_sync( AllLanes, count_cols, span );
}
*/
#else
u64 sub_count = 0;
int sm_count_cols = 0;
int nvn = cnext( VC, KVC, NN - 1 );
// Fixed : was `nv` ( undeclared ), clearly meant `nvn`.
LastCols( AllBits, VC, nvn, KVC - nvn, VX, KVX, SINGLE,
sub_count, sm_count_cols );
count += sub_count;
sm_count += sm_count_cols;
#endif
#if N > 5
#ifdef V
if( blockIdx.x == 0 and threadIdx.x == 0 )
printf( "r3:%09llx %9llu %9llu\n", ROW_VECS[ N - 3 ], count, sm_count );
#endif
#endif
}
if ( VC[ vcsize_c - WS ] != 0ULL ||
VX[ vxsize_c - WS ] != 0ULL ) {
printf( " Array over run ! \n");
assert( 0 );
// exit( 1 );
}
// Warp-wide butterfly reduction of the per-lane magic count.
for( int span = ( WS >> 1 ); span > 0; span >>= 1 ) {
count += __shfl_xor_sync( AllLanes, count, span );
}
if( threadIdx.x == 0 )
{
atomicAdd_system( &count_m, count );
atomicAdd_system( &sm_count_m, sm_count );
}
}
#if N > 3
/*
 * MakeCols : iterative depth-first search over column levels
 * N-4 .. 0 ( the chosen column of level lvl is recorded at slot
 * lvl + 3; LastCols fills slots 2, 1, 0 ). Written with explicit
 * shared-memory stacks and gotos so every control variable lives in
 * __shared__ and stays uniform across the warp — lanes only diverge
 * inside the warp-cooperative helpers.
 * At each level the column must contain the largest still-unused
 * value, so only the first NVN_S[ lvl ] candidates qualify ( cnext ).
 * Choosing a column compacts the remaining column candidates
 * ( disjoint ones, gather_para — the other must-prefix candidates are
 * dropped since each value belongs to exactly one column ) and the
 * diagonal candidates ( sharing exactly one value, gather_cross ) into
 * fresh scratch just past the current lists. Level 0 hands the last
 * three columns to LastCols().
 * Returns this lane's partial magic count; sm_count accumulates the
 * warp-uniform semi-magic count.
 */
__device__ u64 MakeCols( int lvl, u64 rem,
u64* const & vcb, const int & nvc,
u64* const & vxb, const int & nvx, int single, int & sm_count)
{
__shared__ u64 rem_s[ NW ][ N - 3 ];
__shared__ u64* vcb_s[ NW ][ N - 3 ];
__shared__ int nvc_s[ NW ][ N - 3 ];
__shared__ int nvn_s[ NW ][ N - 3 ];
__shared__ u64* vxb_s[ NW ][ N - 3 ];
__shared__ int nvx_s[ NW ][ N - 3 ];
__shared__ int single_s[ NW ][ N - 3 ];
#define REM_S rem_s[ threadIdx.y ]
#define VCB_S vcb_s[ threadIdx.y ]
#define NVC_S nvc_s[ threadIdx.y ]
#define NVN_S nvn_s[ threadIdx.y ]
#define VXB_S vxb_s[ threadIdx.y ]
#define NVX_S nvx_s[ threadIdx.y ]
#define SINGLE_S single_s[ threadIdx.y ]
REM_S[ lvl ] = rem;
VCB_S[ lvl ] = vcb;
NVC_S[ lvl ] = nvc;
NVN_S[ lvl ] = cnext( vcb, nvc, topbit( rem ) );
VXB_S[ lvl ] = vxb;
NVX_S[ lvl ] = nvx;
SINGLE_S[ lvl ] = single;
__shared__ int ivc_s[ NW ][ N - 3 ];
#define IVC_S ivc_s[ threadIdx.y ]
u64 sub_count = 0;
// int sm_count = 0;
Init:
IVC_S[ lvl ] = 0;
// for( ; ivc_s[ lvl ] < nvn_s[ lvl ]; ivc_s[ lvl ] ++ ) {
Loop:
// Level exhausted : return at the root, otherwise backtrack.
if( IVC_S[ lvl ] >= NVN_S[ lvl ] ) {
if( lvl == N - 4 )
return sub_count;
else {
lvl ++;
goto ReInit;
}
}
{ // Scope limit for locals.
__shared__ u64 col_vec[ NW ];
#define COL_VEC col_vec[ threadIdx.y ]
COL_VEC = VCB_S[ lvl ][ IVC_S[ lvl ] ];
__shared__ u64* vxbn[ NW ];
#define VXBN vxbn[ threadIdx.y ]
VXBN = RUWS( NVX_S[ lvl ] ) + VXB_S[ lvl ];
__shared__ int kvx[ NW ];
#undef KVX
#define KVX kvx[ threadIdx.y ]
// Diagonals must cross the chosen column in exactly one value.
KVX = gather_cross( VXB_S[ lvl ], NVX_S[ lvl ], COL_VEC, VXBN );
// if ( KVX < 2 ) goto ReInit; // continue;
__shared__ u64* vcbn[ NW ];
#define VCBN vcbn[ threadIdx.y ]
VCBN = RUWS( NVC_S[ lvl ] ) + VCB_S[ lvl ];
__shared__ int kvc[ NW ];
#undef KVC
#define KVC kvc[ threadIdx.y ]
// Surviving columns : those disjoint from the choice, taken from
// past the must-prefix ( each value lies in exactly one column ).
KVC = gather_para( VCB_S[ lvl ], NVN_S[ lvl ],
NVC_S[ lvl ], COL_VEC, VCBN );
__shared__ int kvn[ NW ];
#define KVN kvn[ threadIdx.y ]
KVN = cnext( VCBN, KVC, topbit( REM_S[ lvl ] ^ COL_VEC ) );
// Prune : need candidates for the next must value and enough columns
// left to fill the remaining lvl + 2 slots.
if( KVN < 0 or KVC - KVN < lvl + 2 ) goto ReInit; // continue;
// ^^^^^^^!!
set_vecv(
col_vecs[threadIdx.y], which_colv[threadIdx.y], COL_VEC, lvl + 3 );
// ^^^^^^^!!
if( lvl > 0 ) {
// Push the filtered state and descend one level.
REM_S[ lvl - 1 ] = REM_S[ lvl ] ^ COL_VEC;
VCB_S[ lvl - 1 ] = VCBN;
NVC_S[ lvl - 1 ] = KVC;
NVN_S[ lvl - 1 ] = KVN;
VXB_S[ lvl - 1 ] = VXBN;
NVX_S[ lvl - 1 ] = KVX;
SINGLE_S[ lvl - 1 ] = SINGLE_S[ lvl ] || ( rev_max_c == COL_VEC );
lvl --;
goto Init;
} else { // 48 / 59
LastCols( REM_S[ lvl ] ^ COL_VEC, VCBN, KVN, KVC-KVN,
VXBN, KVX, SINGLE_S[ lvl ] || ( rev_max_c == COL_VEC ),
sub_count, sm_count );
}
} // Scope limit for locals.
ReInit:
IVC_S[ lvl ] ++;
goto Loop;
}
#endif // N > 3
/*
 * LastCols : place the final three columns. v2 is taken from the
 * candidates containing the largest unused value ( the first nvn
 * entries of vcb ); v1 from the following nvcmn entries that contain
 * the largest value left after v2; v0 is forced as the remainder and
 * only tested against the canonical condition
 * ( !( v0 & 1 ) ) or v_max_c >= Reverse( v0 ).
 * `single` keeps one bit per pending column equal to rev_max_c; when
 * no chosen vector matches the reversed top row, each completion is
 * counted twice ( shift = 1 ) — presumably the reversal / M-
 * transformation symmetry mentioned in the file header.
 * Per column triple : sm_count gains 2^shift ( one semi-magic square
 * per column assignment ), and valid diagonal pairs — counted by the
 * constant-table path MakeDiagC, after MakeDiagS peels elements while
 * the list exceeds MDIAG — are added to sub_count.
 * sub_count is a per-lane partial; sm_count is warp-uniform.
 */
__device__ void LastCols( u64 rem,
u64* const & vcb, const int & nvn, const int & nvcmn,
u64* const & vxb, const int & nvx, int single,
u64 & sub_count, int & sm_count )
{
__shared__ int c2_s[ NW ];
#define C2 c2_s[ threadIdx.y ]
for( C2 = 0; C2 < nvn; C2 ++ ) {
u64 v2 = vcb[ C2 ];
set_vecv( col_vecs[threadIdx.y], which_colv[threadIdx.y], v2, 2 );
// int single2 = ( single << 1 ) | ( rev_max_c == v2 );
single = ( single << 1 ) | ( rev_max_c == v2 );
int nvn1 = cnext( vcb + nvn, nvcmn, topbit( rem ^ v2 ) ); // 2 / 51
for( int c1 = 0; c1 < nvn1; c1 ++ ) {
u64 v1 = vcb[ nvn + c1 ];
if( ! ( v2 & v1 ) ) {
u64 v0 = rem ^ v1 ^ v2;
if( ( ! ( v0 & 1 ) ) or v_max_c >= Reverse( v0 ) ) {
// 1.5 / 51
set_vecv( col_vecs[threadIdx.y], which_colv[threadIdx.y], v1, 1 );
set_vecv( col_vecs[threadIdx.y], which_colv[threadIdx.y], v0, 0 );
// Diagonals must cross v2 and v1 ( and hence v0 ) exactly once.
u64* vxbn = RUWS( nvx ) + vxb;
int kvx = gather_double_cross( vxb, nvx, v2, v1, vxbn ); // 6 / 51
int shift =
( ( single || rev_max_c == v1 || rev_max_c == v0 ) ? 0 : 1 );
// avr( kvx ) : 8.6, 27% : > 10, 0.5% : > 20, 0.0005% : > 30
// 25 / 59
while( kvx > MDIAG ) {
sub_count += MakeDiagS( vxbn, kvx ) << shift;
vxbn ++;
kvx --;
}
sub_count += MakeDiagC( vxbn, kvx ) << shift;
sm_count += ( 1 << shift );
}
}
}
// Drop v2's symmetry bit before trying the next v2.
single >>= 1;
}
return;
}
/*
 * MakeDiagS : diagonal pairing, spill path used while the candidate
 * list is longer than MDIAG. Pairs the first vector of vxb with each
 * of the remaining nvx - 1, warp-parallel. Only the low 40 value bits
 * are compared ( the bytes above bit 40 are the row annotations read
 * by FinalCheck() ) : for odd N the two diagonals must share exactly
 * one value ( presumably the centre cell ), for even N none. Lanes
 * past nvx load -1, which cannot satisfy either predicate.
 * Returns this lane's accepted-pair count.
 */
__device__ int MakeDiagS( u64* const & vxb, const int & nvx )
{
int diag_count = 0;
u64 vdx = *vxb;
for( int ic = 1; ic < nvx; ic += WS ) {
u64 vcx = -1ULL;
if( ic + threadIdx.x < nvx ) vcx = vxb[ ic + threadIdx.x ];
#if N%2
if( __popcll( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 1 )
// if( __popcll( vdx & vcx ) == 1 )
#else
if( ( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 0 )
// if( ( vdx & vcx ) == 0 )
#endif
{
diag_count += // 18 / 59
FinalCheck( which_row[ threadIdx.y ], which_colv[ threadIdx.y ],
vdx, vcx );
}
}
return diag_count;
}
/*
 * MakeDiagC : diagonal pairing via the constant-memory pair table.
 * For a list of vxn ( <= MDIAG ) candidates, p_idx_c[ p_offset_c[vxn]
 * .. p_offset_c[vxn+1] ) enumerates every index pair packed as
 * ( i << 16 ) | j; the warp strides over them. Pair test is the same
 * as MakeDiagS() : low 40 value bits share one value ( odd N ) or
 * none ( even N ), then FinalCheck() validates the geometry.
 * Returns this lane's accepted-pair count.
 */
__device__ int MakeDiagC( u64* const & vx, const int & vxn )
{
int diag_count = 0;
for( int ivx = p_offset_c[ vxn ]; ivx < p_offset_c[ vxn + 1 ]; ivx += WS ) {
if( ivx + threadIdx.x < p_offset_c[ vxn + 1 ] ) {
int idx = p_idx_c[ ivx + threadIdx.x ];
u64 vdx = vx[ idx >> 16 ];
u64 vcx = vx[ idx & 0xffff ];
#if N%2
if( __popcll( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 1 )
#else
if( ( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 0 )
#endif
{
diag_count += // 18 / 59
FinalCheck( which_row[ threadIdx.y ], which_colv[ threadIdx.y ],
vdx, vcx );
}
}
}
return diag_count;
}
/*
 * FinalCheck : last geometric test for a candidate ( diagonal vdx,
 * anti-diagonal vcx ) pair. e2row maps a value ( keyed by clz-style
 * position __clzll( bit ) + NN - 64 ) to the row level containing it;
 * e2colv maps a value to the column vector containing it. The bytes
 * above bit 40 of each diagonal name the values it shares with rows
 * 0..2 ( set in RowSet ).
 * First condition : vdx restricted to the column through vcx's row-0
 * value must lie in the same row as vcx restricted to the column
 * through vdx's row-0 value; the second condition repeats this with
 * the row-1 or row-2 byte, selected by the parity of that common row.
 * Returns 1 when the pair is consistent with the chosen rows and
 * columns, else 0.
 * NOTE(review): the sufficiency of these two checks relies on
 * invariants established upstream ( each diagonal meets every row and
 * column exactly once ); not re-derived here — confirm against
 * reference results.
 */
__device__ int FinalCheck(
int* const &e2row, u64* const &e2colv, const u64 & vdx, const u64 & vcx )
{
int i;
i = e2row[ __clzll( vdx & e2colv[ ( vcx >> 40 ) & 0xff ] ) + NN-64 ];
if( e2row[ __clzll( vcx & e2colv[ ( vdx >> 40 ) & 0xff ] ) + NN-64 ] != i )
return 0;
// Select the row-1 or row-2 annotation byte by the parity of i.
i = 40 + ( ( i & 1 ) + 1 ) * 8;
// i = ( i & 1 ) + 1;
return ( e2row[ __clzll( vdx & e2colv[ ( vcx >> i ) & 0xff ] ) + NN-64 ]
==
e2row[ __clzll( vcx & e2colv[ ( vdx >> i ) & 0xff ] ) + NN-64 ]
) ? 1 : 0;
}
#ifndef noMD5
/*
 * MakeMD5 : write the hex MD5 digest ( 2 * MD5_DIGEST_LENGTH + 1
 * chars, NUL-terminated ) of the formatted "<row1> <row2> <count>"
 * record into md5str.
 * Fixed: the original sized msg from the 32-char sample string while
 * "%09llx %09llx %15llu" always formats 35 chars, overflowing the
 * stack buffer by 3 bytes. The buffer now fits the real formatted
 * length; the digest is still taken over the first msg_len ( 32 )
 * bytes exactly as before, so previously published checksums are
 * unchanged.
 * NOTE(review): hashing only 32 of the 35 formatted chars truncates
 * the last 3 characters of the count field — confirm this is the
 * intended checksum definition.
 */
void MakeMD5( u64 max_p, u64 second_p, u64 count, char md5str[] )
{
MD5_CTX md5ctx;
unsigned char md5bin[ MD5_DIGEST_LENGTH ];
const char* sample = "e00000007 1c0000038 494199069345";
int msg_len = strlen( sample );
char msg[ 9 + 1 + 9 + 1 + 15 + 1 ]; /* "%09llx %09llx %15llu" + NUL */
int fmt_len = snprintf( msg, sizeof( msg ), "%09llx %09llx %15llu",
max_p, second_p, count );
assert( fmt_len < (int) sizeof( msg ) );
MD5_Init( &md5ctx );
MD5_Update( &md5ctx, msg, msg_len );
MD5_Final( md5bin, &md5ctx );
for( int i = 0; i < MD5_DIGEST_LENGTH; i ++ )
sprintf( md5str + ( i * 2 ), "%02x", md5bin[ i ] );
md5str[ MD5_DIGEST_LENGTH * 2 ] = 0;
}
#endif
/*
 * main : driver. Verifies platform assumptions, generates all
 * candidate vectors, builds the diagonal pair table, inventories the
 * CUDA devices, then walks canonical ( first row, second row ) pairs
 * and calls Count() for those selected by the command line :
 *   ./a.out <1stRowNo> <1stRowPattern> <2ndRowNo> <2ndRowPattern>
 * A hex pattern of 0 means "no restriction"; only argv[2] / argv[4]
 * are parsed ( the row-number arguments are informational ).
 * Fixed: argv[2] / argv[4] were read without an argc check ( UB when
 * run with too few arguments ); the p_idx bounds check ran after the
 * store; two stderr message typos.
 */
int main( int argc, char* argv[] )
{
fprintf( stderr, "%s\n", __FILE__ );
fprintf( stderr, "Size of long long unsigned : %lu \n", sizeof( u64 ) );
if( argc < 5 ) {
fprintf( stderr,
"usage: %s <1stRowNo> <1stRowPattern> <2ndRowNo> <2ndRowPattern>\n",
argv[ 0 ] );
exit( 1 );
}
if( sizeof( u64 ) * 8 < NN ) {
fprintf( stderr,
"Size of long long is too small : %lu \n", sizeof( u64 ) );
exit( 1 );
}
if( check_topbit( ) ) {
fprintf( stderr, "topbit() doesn't work correctly on this platform.\n" );
exit( 1 );
}
// All N-value subsets of 1..NN summing to the magic constant N(NN+1)/2.
GenVecs( NN, ( ( NN + 1 ) * N ) / 2, N - 1, 0ULL );
fprintf( stderr, "nvecs %d\n", nvecs );
/* pair indexes ----------------------------------- */
// For every list length ndiag <= MDIAG, precompute all index pairs
// ( idx < icx ) packed as ( idx << 16 ) | icx; p_offset[] delimits the
// slice for each length ( consumed by MakeDiagC ).
int p_offset[ MDIAG + 2 ];
int p_idx[ MDPAIR ];
int ioff = 0;
for( int ndiag = 0; ndiag < MDIAG + 1; ndiag ++ ) {
p_offset[ ndiag ] = ioff;
for( int idx = 0; idx < ndiag - 1; idx ++ ) {
for( int icx = idx + 1; icx < ndiag; icx ++ ) {
if( ioff >= MDPAIR ) { // bounds check BEFORE the store
fprintf(stderr, "p_idx_c index out of range. %d >= %d\n", ioff, MDPAIR );
exit( 1 );
}
p_idx[ ioff ++ ] = ( idx << 16 ) | icx;
}
}
}
p_offset[ MDIAG + 1 ] = ioff;
fprintf( stderr, "size of p_idx_c %d \n", ioff );
/* ------------------------------------------------ */
// Get device properties
// Set a part of constant memory
//
int ndev;
CU_SAFE( cudaGetDeviceCount( &ndev ) );
vector<int> dev_blocks( ndev );
for( int idev = 0 ; idev < ndev; idev ++ ) {
CU_SAFE( cudaSetDevice( idev ) );
cudaDeviceProp prop;
CU_SAFE( cudaGetDeviceProperties( &prop, idev ) );
int nblocks = prop.multiProcessorCount * NB_SM;
dev_blocks[ idev ] = nblocks;
fprintf( stderr, "DevId:%2d #SM:%d, #Blocks:%d %s\n",
idev, prop.multiProcessorCount, nblocks, prop.name);
// The pair table is constant for the whole run; upload per device.
CU_SAFE( cudaMemcpyToSymbol(
p_offset_c, p_offset, sizeof( int ) * ( MDIAG + 2 ) ) );
CU_SAFE( cudaMemcpyToSymbol( p_idx_c, p_idx, sizeof( int ) * ioff ) );
}
//
u64 row1P = strtoll( argv[ 2 ], NULL, 16 );
u64 row2P = strtoll( argv[ 4 ], NULL, 16 );
fprintf( stderr, "r1:%09llx r2:%09llx\n", row1P, row2P );
u64 total_count = 0ULL;
u64 total_sm_count = 0ULL;
// vecs[] is sorted in decreasing order, so the range restrictions and
// canonical-form conditions below can break out early.
for( int max_row = 0; max_row < nvecs; ++ max_row ) {
u64 max_rowP = vecs[ max_row ];
if( row1P != 0ULL and row1P < max_rowP ) continue;
if( max_rowP < row1P ) break;
// Canonical first row must contain the largest value NN.
if( ! ( max_rowP & 1ULL << ( NN - 1 ) ) ) break;
if( max_rowP < Reverse( max_rowP ) ) continue;
for( int second_row = max_row + 1; second_row < nvecs; ++ second_row ) {
u64 second_rowP = vecs[ second_row ];
if( row2P != 0ULL and row2P < second_rowP ) continue;
if( second_rowP < row2P ) break;
if( max_rowP & second_rowP ) continue;
if( max_rowP < Reverse( second_rowP ) ) continue;
// Canonical ordering prune : the second row must dominate the
// pattern of all remaining values.
u64 rem = AllBits ^ max_rowP ^ second_rowP;
if( rem > second_rowP ) break;
count_m = 0ULL;
sm_count_m = 0ULL;
Count( max_row, second_row, dev_blocks );
// One record per count, plus an MD5 checksum so distributed
// partial results can be validated.
printf( "%05d %09llx ", max_row, vecs[ max_row ] );
printf( "%05d %09llx ", second_row, vecs[ second_row ] );
printf( "%15llu ", count_m );
#ifndef noMD5
char md5str[ MD5_DIGEST_LENGTH * 2 + 1 ];
MakeMD5( vecs[ max_row ], vecs[ second_row ], count_m, md5str );
printf( "%s", md5str );
#endif
printf( "\n");
printf( "%05d %09llx ", max_row, vecs[ max_row ] );
printf( "%05d %09llx ", second_row, vecs[ second_row ] );
printf( "S %13llu ", sm_count_m );
#ifndef noMD5
MakeMD5( vecs[ max_row ], vecs[ second_row ], sm_count_m, md5str );
printf( "%s", md5str );
#endif
printf( "\n");
total_count += count_m;
total_sm_count += sm_count_m;
}
}
// Totals are printed only for an unrestricted second row.
if( row2P == 0ULL ) {
fprintf( stderr, "Total: %17llu\n", total_count );
fprintf( stderr, "Total_SM:%15llu\n", total_sm_count );
}
return 0;
}