ms-20230131.cu
Differences
This shows you the differences between two versions of the page.
| Next revision | Previous revision | ||
| ms-20230131.cu [2023/03/03 18:10] – created mino | ms-20230131.cu [2023/03/12 10:55] (current) – mino | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| - | abc | + | == An execution example on GeForce RTX-4090 == |
| + | |||
| + | $ nvcc -O3 -arch=sm_60 -maxrregcount=40 ms-20230131.cu -lcrypto | ||
| + | $ time ./a.out 00000 e00000007 13745 180800250 | ||
| + | ms-20230131.cu | ||
| + | Size of long long unsigned : 8 | ||
| + | topbit() works correctly. | ||
| + | nvecs 32134 | ||
| + | size of p_idx_c 2300 | ||
| + | DevId: 0 #SM:128, # | ||
| + | r1: | ||
| + | nrows 3116 n3rd 711 | ||
| + | row_set.size() 5956644 | ||
| + | 00000 e00000007 13745 180800250 | ||
| + | |||
| + | real 2m57.464s | ||
| + | user 2m57.277s | ||
| + | sys | ||
| + | |||
| + | |||
| + | <file cpp ms-20230131.cu> | ||
| + | /* | ||
| + | ms.cu | ||
| + | |||
| + | Counting number of magic squares up to rotations, | ||
| + | reflections and M tranformations. | ||
| + | |||
| + | Compilation : | ||
| + | |||
| + | nvcc -O3 -arch=sm_60 -maxrregcount=40 \ | ||
| + | | ||
| + | |||
| + | To omit md5 checksum, add a -DnoMD5 option. Then you may drop | ||
| + | -Wnodeprecated-declarations and -lcrypto. | ||
| + | |||
| + | -DV option produces verbose outputs for N=6. | ||
| + | |||
| + | Command line options: | ||
| + | |||
| + | % ./ | ||
| + | example : ./a.out 00000 e00000007 13745 180800250 | ||
| + | |||
| + | See also : https:// | ||
| + | |||
| + | Updated on 2023/ | ||
| + | Authored by Hidetoshi Mino hidetoshimino@gmail.com . | ||
| + | |||
| + | */ | ||
| + | #include < | ||
| + | #include < | ||
| + | #include < | ||
| + | #include < | ||
| + | #include < | ||
| + | #ifndef noMD5 | ||
| + | #include < | ||
| + | #endif | ||
| + | #include < | ||
| + | using namespace std; | ||
| + | |||
| + | #ifndef N // order of square : 3..6 | ||
| + | #define N 6 | ||
| + | #endif | ||
| + | |||
| + | #ifndef NB_SM // Number of blocks per SM | ||
| + | #define NB_SM 16 | ||
| + | #endif | ||
| + | |||
| + | #ifndef NW // Number of Warps per block | ||
| + | #define NW 3 | ||
| + | #endif | ||
| + | |||
| + | #ifndef NSMPL | ||
| + | #define NSMPL 1 | ||
| + | #endif | ||
| + | |||
| + | #ifndef MDIAG | ||
| + | #define MDIAG 24 | ||
| + | #endif | ||
| + | |||
| + | typedef long long unsigned u64; | ||
| + | |||
| + | #define NN (N*N) | ||
| + | #define AllBits ( ( 1ULL << NN ) - 1 ) | ||
| + | #define AllLanes (0xffffffff) | ||
| + | |||
| + | #define MAX( x, y ) ( (x) > (y) ? (x) : (y) ) | ||
| + | #define CU_SAFE(stmt) \ | ||
| + | do { \ | ||
| + | cudaError_t status = stmt; \ | ||
| + | if (status != cudaSuccess) { \ | ||
| + | | ||
| + | | ||
| + | exit( 1 ); \ | ||
| + | } \ | ||
| + | } while (0) | ||
| + | |||
| + | #define WS 32 /* Warp size */ | ||
| + | #define RUWS( x ) ( ( ( x ) + ( WS - 1 ) ) & ~( WS - 1 ) ) /* Round up to WS */ | ||
| + | #define RDWS( x ) ( ( x ) & ~( WS - 1 ) ) /* Round down to WS */ | ||
| + | |||
| + | __device__ __host__ int topbit( u64 v ) { | ||
| + | |||
| + | #ifdef __CUDA_ARCH__ | ||
| + | | ||
| + | #else | ||
| + | v = v & ( ~( v >> 1 ) ); | ||
| + | float f32 = (float) v; | ||
| + | int pos = *( (int *) &f32 ); | ||
| + | pos =( pos >> 23 ) - 0x7f; | ||
| + | | ||
| + | #endif | ||
| + | /* | ||
| + | int pos = 0; | ||
| + | int v32 = v >> 32; | ||
| + | if( v32 ) { | ||
| + | pos = 32; | ||
| + | } else { | ||
| + | v32 = (int) v; | ||
| + | } | ||
| + | if( v32 & 0xffff0000UL ) { pos +=16; v32 >>= 16; } | ||
| + | if( v32 & 0x0000ff00UL ) { pos += 8; v32 >> | ||
| + | if( v32 & 0x000000f0UL ) { pos += 4; v32 >> | ||
| + | if( v32 & 0x0000000cUL ) { pos += 2; v32 >> | ||
| + | if( v32 & 0000000002UL ) pos++; | ||
| + | return pos; | ||
| + | */ | ||
| + | } | ||
| + | |||
| + | __device__ __host__ u64 reverse64( u64 x ) { | ||
| + | |||
| + | #ifdef __CUDA_ARCH__ | ||
| + | |||
| + | | ||
| + | |||
| + | #else | ||
| + | u64 y; | ||
| + | y = ( ( x & 0x5555555555555555ULL ) << 1 ); | ||
| + | x = ( y | ( ( x >> 1 ) & 0x5555555555555555ULL ) ); | ||
| + | y = ( ( x & 0x3333333333333333ULL ) << 2 ); | ||
| + | x = ( y | ( ( x >> 2 ) & 0x3333333333333333ULL ) ); | ||
| + | y = ( ( x & 0x0F0F0F0F0F0F0F0FULL ) << 4 ); | ||
| + | x = ( y | ( ( x >> 4 ) & 0x0F0F0F0F0F0F0F0FULL ) ); | ||
| + | y = ( ( x & 0x00FF00FF00FF00FFULL ) << 8 ); | ||
| + | x = ( y | ( ( x >> 8 ) & 0x00FF00FF00FF00FFULL ) ); | ||
| + | y = ( ( x & 0x0000FFFF0000FFFFULL ) << 16 ); | ||
| + | x = ( y | ( ( x >> 16 ) & 0x0000FFFF0000FFFFULL ) ); | ||
| + | y = ( ( x & 0x00000000FFFFFFFFULL ) << 32 ); | ||
| + | x = ( y | ( ( x >> 32 ) & 0x00000000FFFFFFFFULL ) ); | ||
| + | | ||
| + | #endif | ||
| + | } | ||
| + | |||
| + | int check_topbit( ) { | ||
| + | |||
| + | for( int i = 0; i < NN; i ++ ) { | ||
| + | // printf( "%d %09llx %d\n", i, 1ULL << i, topbit( 1ULL << i ) ); | ||
| + | if( topbit( 1ULL << i ) != i ) return 1; | ||
| + | } | ||
| + | for( int i = 1; i < NN; i ++ ) { | ||
| + | // printf( "%d %09llx %d\n" | ||
| + | // , i, ( 1ULL << i ) - 1, topbit( ( 1ULL << i ) - 1 ) ); | ||
| + | if( topbit( ( 1ULL << i ) - 1 ) != i - 1 ) return 1; | ||
| + | } | ||
| + | | ||
| + | | ||
| + | } | ||
| + | |||
| + | __device__ __host__ int vcsize_f( int ncols, int order ) | ||
| + | { | ||
| + | return RUWS( ncols ) * ( order - 2 ) + WS; | ||
| + | } | ||
| + | |||
| + | __device__ __host__ int vxsize_f( int ndiags, int order ) | ||
| + | { | ||
| + | return RUWS( ndiags ) * ( order - 1 ) + WS; | ||
| + | } | ||
| + | |||
| + | /* = Host globals ======================= */ | ||
| + | |||
| + | #define MVECS 32768 | ||
| + | int nvecs = 0; | ||
| + | u64 vecs[ MVECS ]; | ||
| + | |||
| + | /* = Constant memory ==================== */ | ||
| + | |||
| + | __constant__ u64 v_max_c; | ||
| + | __constant__ u64 rev_max_c; | ||
| + | __constant__ u64 v_second_c; | ||
| + | __constant__ int single_c; | ||
| + | __constant__ int nrowset_c; | ||
| + | __constant__ int ncols_c, ndiags_c; | ||
| + | __constant__ int vcsize_c, vxsize_c; | ||
| + | |||
| + | #define MDPAIR ( MDIAG * MDIAG * MDIAG / 5 + ( WS - 1 ) * MDIAG ) | ||
| + | |||
| + | __constant__ int p_offset_c[ MDIAG + 2 ]; | ||
| + | __constant__ int p_idx_c[ MDPAIR ]; | ||
| + | |||
| + | /* = Block __shared__ ==================== */ | ||
| + | |||
| + | __shared__ int which_row[ NW ][ 64 ]; | ||
| + | __shared__ int which_col[ NW ][ 64 ]; | ||
| + | __shared__ u64 row_vecs[ NW ][ N ]; | ||
| + | __shared__ u64 col_vecs[ NW ][ N ]; | ||
| + | |||
| + | /* = Managed ( Unified ) =================== */ | ||
| + | |||
| + | __managed__ int i_rowset_m = 0; | ||
| + | |||
| + | __managed__ u64 count_m = 0ULL; | ||
| + | |||
| + | /* ========================================= */ | ||
| + | |||
| + | void Count( int max_row, int second_row, const vector< | ||
| + | void MakeRows( int m, int l, u64 rem, u64* vrb, u64* vre, vector< | ||
| + | |||
| + | __global__ | ||
| + | __device__ int MakeCols( int level, u64 rem, | ||
| + | u64* const & vcb, const int & nvc, | ||
| + | u64* const & vxb, const int & nvx, int single ); | ||
| + | __device__ void LastCols( u64 rem, | ||
| + | u64* const & vcb, const int & nvc, const int & nvn, | ||
| + | u64* const & vxb, const int & nvx, int single, int & sub_count ); | ||
| + | __device__ int MakeDiagC( u64* const & vxb, const int & nvx ); | ||
| + | __device__ int MakeDiagS( u64* const & vxb, const int & nvx ); | ||
| + | __device__ int FinalCheck( | ||
| + | int* const &e2row, int* const &e2col, u64* const &col_v, u64* const &row_v, | ||
| + | const u64 & vdx, const u64 & vcx ); | ||
| + | |||
| + | int Cross( int v1, int v2 ) { | ||
| + | u64 v = vecs[v1] & vecs[v2]; | ||
| + | | ||
| + | } | ||
| + | |||
| + | __device__ __host__ u64 Reverse( u64 v ) | ||
| + | { | ||
| + | return reverse64( v ) >> (64 - NN); | ||
| + | } | ||
| + | |||
| + | void GenVecs( int max_ent, int rem_sum, int level, u64 vp ) | ||
| + | { | ||
| + | if ( level >= 0 ) { | ||
| + | for ( int ent = max_ent; ent > 0 ; --ent ) { | ||
| + | u64 v = 1ULL << ( ent - 1 ); | ||
| + | if ( rem_sum >= ent ) | ||
| + | GenVecs( ent - 1, rem_sum - ent, level - 1, vp | v ); | ||
| + | } | ||
| + | } | ||
| + | else { | ||
| + | if ( rem_sum == 0 ) { | ||
| + | vecs[ nvecs ++ ] = vp; | ||
| + | } | ||
| + | } | ||
| + | if( nvecs > MVECS ) { | ||
| + | fprintf( stderr, "nvecs > MVECS\n" | ||
| + | exit(1); | ||
| + | } | ||
| + | } | ||
| + | |||
| + | void Count( int max_row, int second_row, const vector< | ||
| + | { | ||
| + | u64 v_max = vecs[ max_row ]; | ||
| + | u64 v_second = vecs[ second_row ]; | ||
| + | u64 rem = AllBits ^ v_max ^ v_second; | ||
| + | u64 must_bit = ( 1ULL << | ||
| + | |||
| + | vector< | ||
| + | vector< | ||
| + | vector< | ||
| + | |||
| + | int n3rd = 0; | ||
| + | for( int i = second_row + 1; i < nvecs; ++ i ) | ||
| + | if( ( ! ( v_max & vecs[ i ] ) ) && | ||
| + | && v_max >= Reverse( vecs[ i ] ) ) | ||
| + | { | ||
| + | | ||
| + | if( vecs[ i ] & must_bit ) n3rd ++; | ||
| + | } | ||
| + | int nrows = rows.size(); | ||
| + | |||
| + | vector< | ||
| + | u64 vr[ nrows * ( N - 2 ) ]; | ||
| + | for( int i = 0; i < nrows; i ++ ) vr[ i ] = rows[ i ]; | ||
| + | fprintf( stderr, "nrows %d n3rd %d \n", nrows, n3rd ); | ||
| + | |||
| + | MakeRows( N - 2, 0, rem, vr, vr + nrows, row_set ); | ||
| + | fprintf( stderr, " | ||
| + | |||
| + | for( int i = max_row + 1; i < nvecs; ++ i ) | ||
| + | if( Cross( max_row, i ) && Cross( second_row, i ) | ||
| + | && v_max >= Reverse( vecs[ i ] ) ) | ||
| + | { | ||
| + | | ||
| + | } | ||
| + | int ncols = cols.size(); | ||
| + | |||
| + | for( int i = 0; i < nvecs; ++ i ) | ||
| + | if( Cross( max_row, i ) && Cross( second_row, i ) ) | ||
| + | | ||
| + | int ndiags = diags.size(); | ||
| + | |||
| + | /* constant parameters for the __global__ RowSet() | ||
| + | |||
| + | u64 rev_max = Reverse( v_max ); | ||
| + | int single = rev_max == v_max || rev_max == v_second; | ||
| + | int nrowset = row_set.size(); | ||
| + | int vcsize = vcsize_f( ncols, N ); | ||
| + | int vxsize = vxsize_f( ndiags, N ); | ||
| + | |||
| + | // Device Memory Allocation ------------------- | ||
| + | |||
| + | int ndev = dev_blocks.size(); | ||
| + | |||
| + | u64* row_set_d[ ndev ]; | ||
| + | u64* vc_d[ ndev ]; | ||
| + | u64* vx_d[ ndev ]; | ||
| + | |||
| + | i_rowset_m = 0; // initialize managed variable | ||
| + | // count_m = 0ULL; // Don't initialize here. | ||
| + | |||
| + | for( int idev = 0 ; idev < ndev; idev ++ ) { | ||
| + | |||
| + | CU_SAFE( cudaSetDevice( idev ) ); | ||
| + | |||
| + | int nblocks = dev_blocks[ idev ]; | ||
| + | |||
| + | CU_SAFE( cudaMalloc( row_set_d + idev, sizeof( u64 ) * row_set.size() ) ); | ||
| + | CU_SAFE( cudaMemcpy( row_set_d[ idev ], row_set.data(), | ||
| + | sizeof( u64 ) * row_set.size(), | ||
| + | |||
| + | CU_SAFE( cudaMalloc( vc_d + idev, | ||
| + | sizeof(u64) * ( RUWS( ncols ) + vcsize * nblocks * NW ) ) ); | ||
| + | CU_SAFE( cudaMalloc( vx_d + idev, | ||
| + | sizeof(u64) * ( RUWS( ndiags ) + vxsize * nblocks * NW ) ) ); | ||
| + | CU_SAFE( cudaMemcpy( vc_d[ idev ], cols.data(), | ||
| + | cudaMemcpyHostToDevice ) ); | ||
| + | CU_SAFE( cudaMemcpy( vx_d[ idev ], diags.data(), | ||
| + | cudaMemcpyHostToDevice ) ); | ||
| + | |||
| + | // constant memory setup -------------------- | ||
| + | |||
| + | CU_SAFE( cudaMemcpyToSymbol( v_max_c, &v_max, sizeof( u64 ) ) ); | ||
| + | CU_SAFE( cudaMemcpyToSymbol( v_second_c, & | ||
| + | CU_SAFE( cudaMemcpyToSymbol( rev_max_c, & | ||
| + | |||
| + | CU_SAFE( cudaMemcpyToSymbol( single_c, & | ||
| + | CU_SAFE( cudaMemcpyToSymbol( nrowset_c, & | ||
| + | |||
| + | CU_SAFE( cudaMemcpyToSymbol( ncols_c, &ncols, sizeof( int ) ) ); | ||
| + | CU_SAFE( cudaMemcpyToSymbol( ndiags_c, & | ||
| + | |||
| + | CU_SAFE( cudaMemcpyToSymbol( vcsize_c, & | ||
| + | CU_SAFE( cudaMemcpyToSymbol( vxsize_c, & | ||
| + | |||
| + | // ------------------------------------------ | ||
| + | |||
| + | RowSet <<< | ||
| + | row_set_d[ idev ], vc_d[ idev ], vx_d[ idev ] ); | ||
| + | } | ||
| + | |||
| + | for( int idev = 0 ; idev < ndev; idev ++ ) { | ||
| + | |||
| + | CU_SAFE( cudaSetDevice( idev ) ); | ||
| + | // cudaDeviceSynchronize(); | ||
| + | |||
| + | // Device Memory Deallocation ---------------- | ||
| + | |||
| + | CU_SAFE( cudaFree( vx_d[ idev ] ) ); | ||
| + | CU_SAFE( cudaFree( vc_d[ idev ] ) ); | ||
| + | CU_SAFE( cudaFree( row_set_d[ idev ] ) ); | ||
| + | } | ||
| + | |||
| + | return; | ||
| + | } | ||
| + | |||
| + | void MakeRows( int m, int l, u64 rem, u64* vrb, u64* vre, vector< | ||
| + | { | ||
| + | static u64 row_vecs[ N ]; | ||
| + | |||
| + | u64 must_bit = ( 1ULL << | ||
| + | |||
| + | for( ; vrb < vre; vrb += NSMPL ) { | ||
| + | |||
| + | u64 row_vec = *vrb; | ||
| + | if( ! ( row_vec & must_bit ) ) break; | ||
| + | |||
| + | | ||
| + | |||
| + | if( m > l + 1 ) { | ||
| + | u64* vrp = vrb + 1; | ||
| + | u64* vrq = vre; | ||
| + | while( vrp < vre ) { | ||
| + | *vrq = *vrp; | ||
| + | vrq += ( row_vec & *vrp ) ? 0 : 1; | ||
| + | vrp ++; | ||
| + | } | ||
| + | MakeRows( m, l + 1, rem ^ row_vec, vre, vrq, res ); | ||
| + | } else { | ||
| + | for( int i = 0; i < m; i ++ ) { | ||
| + | | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | |||
| + | __device__ int gather_para( u64* const & vp, const int & bv, const int & ev, | ||
| + | const u64 & x, u64* const & vpn ) | ||
| + | { | ||
| + | int j = 0; | ||
| + | for( int i = RDWS( bv ); i < ev; i += WS ) { | ||
| + | int i_t = i + threadIdx.x; | ||
| + | u64 v = -1ULL; | ||
| + | if( i_t >=bv and i_t < ev ) v = vp[ i_t ]; | ||
| + | int p = ! ( v & x ); | ||
| + | unsigned ballot = __ballot_sync( AllLanes, p ); | ||
| + | unsigned sub_sum = __popc( ballot << ( WS - threadIdx.x ) ); | ||
| + | if( p ) | ||
| + | vpn[ j + sub_sum ] = v; | ||
| + | j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 ); | ||
| + | } | ||
| + | | ||
| + | } | ||
| + | |||
| + | __device__ int cnext( u64* vp, int nv, int must_bit ) | ||
| + | { | ||
| + | int j = 0; | ||
| + | for( int i = 0; i < nv; i += WS ) { | ||
| + | u64 v = 0ULL; | ||
| + | if( i + threadIdx.x < nv ) v = vp[ i + threadIdx.x ]; | ||
| + | j += __popc( __ballot_sync( AllLanes, v >> must_bit ) ); | ||
| + | if( j != i + WS ) break; | ||
| + | } | ||
| + | | ||
| + | } | ||
| + | |||
| + | __device__ int gather_cross( | ||
| + | u64* const & vp, const int & nv, const u64 & x, u64* const & vpn ) | ||
| + | { | ||
| + | int j = 0; | ||
| + | for( int i = 0; i < nv; i += WS ) { | ||
| + | int i_t = i + threadIdx.x; | ||
| + | u64 v = 0ULL; | ||
| + | if( i_t < nv ) v = vp[ i_t ]; | ||
| + | int p = __popcll( v & x ) == 1; | ||
| + | unsigned ballot = __ballot_sync( AllLanes, p ); | ||
| + | unsigned sub_sum = __popc( ballot << ( WS - threadIdx.x ) ); | ||
| + | if( p ) | ||
| + | vpn[ j + sub_sum ] = v; | ||
| + | j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 ); | ||
| + | } | ||
| + | | ||
| + | } | ||
| + | |||
| + | __device__ int gather_double_cross( u64* const & vp, const int & nv, | ||
| + | const u64 & x1, const u64 & x2, u64* const & vpn ) | ||
| + | { | ||
| + | int j = 0; | ||
| + | for( int i = 0; i < nv; i += WS ) { | ||
| + | int i_t = i + threadIdx.x; | ||
| + | u64 v = 0ULL; | ||
| + | if( i_t < nv ) v = vp[ i_t ]; | ||
| + | int p = __popcll( v & x1 ) == 1 and __popcll( v & x2 ) == 1; | ||
| + | unsigned ballot = __ballot_sync( AllLanes, p ); | ||
| + | unsigned sub_sum = __popc( ballot << ( WS - threadIdx.x ) ); | ||
| + | if( p ) | ||
| + | vpn[ j + sub_sum ] = v; | ||
| + | j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 ); | ||
| + | } | ||
| + | | ||
| + | } | ||
| + | |||
| + | __device__ int gather_triple_cross( u64* const & vp, const int & nv, | ||
| + | const u64 & x1, const u64 & x2, const u64 & x3, u64* const & vpn ) | ||
| + | { | ||
| + | int j = 0; | ||
| + | for( int i = 0; i < nv; i += WS ) { | ||
| + | int i_t = i + threadIdx.x; | ||
| + | u64 v = 0ULL; | ||
| + | if( i_t < nv ) v = vp[ i_t ]; | ||
| + | int p = __popcll( v & x1 ) == 1 | ||
| + | and __popcll( v & x2 ) == 1 | ||
| + | and __popcll( v & x3 ) == 1; | ||
| + | unsigned ballot = __ballot_sync( AllLanes, p ); | ||
| + | unsigned sub_sum = __popc( ballot << ( WS - threadIdx.x ) ); | ||
| + | if( p ) | ||
| + | vpn[ j + sub_sum ] = v; | ||
| + | j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 ); | ||
| + | } | ||
| + | | ||
| + | } | ||
| + | |||
| + | __device__ void set_vec( int index[ 64 ], u64 vecs[ N ], | ||
| + | const u64 & v, const int & level ) | ||
| + | { | ||
| + | vecs[ level ] = v; | ||
| + | if( ( v >> threadIdx.x ) & 1 ) | ||
| + | | ||
| + | if( ( ( v >> WS ) >> threadIdx.x ) & 1 ) | ||
| + | | ||
| + | __syncwarp(); | ||
| + | } | ||
| + | |||
| + | __global__ void RowSet( u64* rowset, u64* vc_g, u64* vx_g ) | ||
| + | { | ||
| + | #define WHICH_ROW which_row[ threadIdx.y ] | ||
| + | #define ROW_VECS row_vecs[ threadIdx.y ] | ||
| + | |||
| + | set_vec( WHICH_ROW, ROW_VECS, v_max_c, N - 1 ); | ||
| + | set_vec( WHICH_ROW, ROW_VECS, v_second_c, N - 2 ); | ||
| + | |||
| + | __shared__ u64* vc_s[ NW ]; | ||
| + | __shared__ u64* vx_s[ NW ]; | ||
| + | #define VC vc_s[ threadIdx.y ] | ||
| + | #define VX vx_s[ threadIdx.y ] | ||
| + | |||
| + | VC = vc_g + RUWS( ncols_c ) + vcsize_c * ( blockIdx.x * NW + threadIdx.y ); | ||
| + | VX = vx_g + RUWS( ndiags_c ) + vxsize_c * ( blockIdx.x * NW + threadIdx.y ); | ||
| + | |||
| + | VC[ vcsize_c - WS ] = 0ULL; | ||
| + | VX[ vxsize_c - WS ] = 0ULL; | ||
| + | |||
| + | u64 count = 0ULL; | ||
| + | |||
| + | for( ; /* i_rowset < nrowset_c */; /* i_rowset += N - 2 */ ) { | ||
| + | |||
| + | | ||
| + | # | ||
| + | |||
| + | if( threadIdx.x == 0 ) { | ||
| + | I_ROWSET = atomicAdd_system( & | ||
| + | } | ||
| + | | ||
| + | // i_rowset = __shfl_sync( AllLanes, i_rowset, 0 ); // 0903 | ||
| + | |||
| + | if( I_ROWSET >= nrowset_c ) break; | ||
| + | |||
| + | | ||
| + | # | ||
| + | | ||
| + | for( int i = 0; i < N - 2; i ++ ) { | ||
| + | u64 row_vec = rowset[ I_ROWSET + i ]; | ||
| + | set_vec( WHICH_ROW, ROW_VECS, row_vec, N - 3 - i ); | ||
| + | SINGLE = SINGLE or rev_max_c == row_vec; | ||
| + | } | ||
| + | |||
| + | | ||
| + | | ||
| + | # | ||
| + | # | ||
| + | |||
| + | #if N == 6 | ||
| + | KVC = gather_triple_cross( vc_g, ncols_c, | ||
| + | | ||
| + | KVX = gather_triple_cross( vx_g, ndiags_c, | ||
| + | | ||
| + | #endif | ||
| + | #if N == 5 | ||
| + | KVC = gather_double_cross( vc_g, ncols_c, | ||
| + | | ||
| + | KVX = gather_double_cross( vx_g, ndiags_c, | ||
| + | | ||
| + | #endif | ||
| + | #if N == 4 | ||
| + | KVC = gather_cross( vc_g, ncols_c, ROW_VECS[ N - 3 ], VC ); | ||
| + | KVX = gather_cross( vx_g, ndiags_c, ROW_VECS[ N - 3 ], VX ); | ||
| + | #endif | ||
| + | #if N == 3 | ||
| + | for( int i = 0; i < ncols_c; i++ ) VC[ i ] = vc_g[ i ]; | ||
| + | for( int i = 0; i < ndiags_c; i++ ) VX[ i ] = vx_g[ i ]; | ||
| + | KVC = ncols_c; | ||
| + | KVX = ndiags_c; | ||
| + | #endif | ||
| + | |||
| + | if( KVC < N or KVX < 2 ) continue; | ||
| + | |||
| + | #if N > 3 | ||
| + | // 53 / 51 ? | ||
| + | count += MakeCols( N - 4, AllBits, VC, KVC, VX, KVX, SINGLE ) ; | ||
| + | #else | ||
| + | int sub_count = 0; | ||
| + | int nvn = cnext( VC, KVC, NN - 1 ); | ||
| + | | ||
| + | count += sub_count; | ||
| + | #endif | ||
| + | |||
| + | #if N > 5 | ||
| + | # | ||
| + | if( blockIdx.x == 0 and threadIdx.x == 0 ) | ||
| + | | ||
| + | # | ||
| + | # | ||
| + | } | ||
| + | if ( VC[ vcsize_c - WS ] != 0ULL || | ||
| + | VX[ vxsize_c - WS ] != 0ULL ) { | ||
| + | | ||
| + | | ||
| + | // exit( 1 ); | ||
| + | } | ||
| + | |||
| + | for( int span = ( WS >> 1 ); span > 0; span >>= 1 ) { | ||
| + | count += __shfl_xor_sync( AllLanes, count, span ); | ||
| + | } | ||
| + | if( threadIdx.x == 0 ) | ||
| + | { | ||
| + | | ||
| + | } | ||
| + | } | ||
| + | |||
| + | #if N > 3 | ||
| + | __device__ int MakeCols( int lvl, u64 rem, | ||
| + | u64* const & vcb, const int & nvc, | ||
| + | u64* const & vxb, const int & nvx, int single ) | ||
| + | { | ||
| + | __shared__ u64 rem_s[ NW ][ N - 3 ]; | ||
| + | __shared__ u64* vcb_s[ NW ][ N - 3 ]; | ||
| + | __shared__ int nvc_s[ NW ][ N - 3 ]; | ||
| + | __shared__ int nvn_s[ NW ][ N - 3 ]; | ||
| + | __shared__ u64* vxb_s[ NW ][ N - 3 ]; | ||
| + | __shared__ int nvx_s[ NW ][ N - 3 ]; | ||
| + | __shared__ int single_s[ NW ][ N - 3 ]; | ||
| + | |||
| + | #define REM_S rem_s[ threadIdx.y ] | ||
| + | #define VCB_S vcb_s[ threadIdx.y ] | ||
| + | #define NVC_S nvc_s[ threadIdx.y ] | ||
| + | #define NVN_S nvn_s[ threadIdx.y ] | ||
| + | #define VXB_S vxb_s[ threadIdx.y ] | ||
| + | #define NVX_S nvx_s[ threadIdx.y ] | ||
| + | #define SINGLE_S single_s[ threadIdx.y ] | ||
| + | |||
| + | REM_S[ lvl ] = rem; | ||
| + | VCB_S[ lvl ] = vcb; | ||
| + | NVC_S[ lvl ] = nvc; | ||
| + | NVN_S[ lvl ] = cnext( vcb, nvc, topbit( rem ) ); | ||
| + | VXB_S[ lvl ] = vxb; | ||
| + | NVX_S[ lvl ] = nvx; | ||
| + | SINGLE_S[ lvl ] = single; | ||
| + | |||
| + | __shared__ int ivc_s[ NW ][ N - 3 ]; | ||
| + | #define IVC_S ivc_s[ threadIdx.y ] | ||
| + | |||
| + | int sub_count = 0; | ||
| + | |||
| + | Init: | ||
| + | |||
| + | | ||
| + | |||
| + | // | ||
| + | |||
| + | Loop: | ||
| + | |||
| + | if( IVC_S[ lvl ] >= NVN_S[ lvl ] ) { | ||
| + | if( lvl == N - 4 ) | ||
| + | | ||
| + | else { | ||
| + | lvl ++; | ||
| + | goto ReInit; | ||
| + | } | ||
| + | } | ||
| + | |||
| + | { // Scope limit for locals. | ||
| + | |||
| + | | ||
| + | # | ||
| + | | ||
| + | |||
| + | | ||
| + | # | ||
| + | VXBN = RUWS( NVX_S[ lvl ] ) + VXB_S[ lvl ]; | ||
| + | |||
| + | | ||
| + | # | ||
| + | # | ||
| + | |||
| + | KVX = gather_cross( VXB_S[ lvl ], NVX_S[ lvl ], COL_VEC, VXBN ); | ||
| + | if ( KVX < 2 ) goto ReInit; // continue; | ||
| + | |||
| + | | ||
| + | # | ||
| + | |||
| + | VCBN = RUWS( NVC_S[ lvl ] ) + VCB_S[ lvl ]; | ||
| + | |||
| + | | ||
| + | # | ||
| + | # | ||
| + | |||
| + | KVC = gather_para( VCB_S[ lvl ], NVN_S[ lvl ], | ||
| + | | ||
| + | | ||
| + | # | ||
| + | |||
| + | KVN = cnext( VCBN, KVC, topbit( REM_S[ lvl ] ^ COL_VEC ) ); | ||
| + | if( KVN < 0 or KVC - KVN < lvl + 2 ) goto ReInit; // continue; | ||
| + | // ^^^^^^^!! | ||
| + | | ||
| + | // ^^^^^^^!! | ||
| + | if( lvl > 0 ) { | ||
| + | |||
| + | REM_S[ lvl - 1 ] = REM_S[ lvl ] ^ COL_VEC; | ||
| + | VCB_S[ lvl - 1 ] = VCBN; | ||
| + | NVC_S[ lvl - 1 ] = KVC; | ||
| + | NVN_S[ lvl - 1 ] = KVN; | ||
| + | VXB_S[ lvl - 1 ] = VXBN; | ||
| + | NVX_S[ lvl - 1 ] = KVX; | ||
| + | SINGLE_S[ lvl - 1 ] = SINGLE_S[ lvl ] || ( rev_max_c == COL_VEC ); | ||
| + | |||
| + | lvl --; | ||
| + | goto Init; | ||
| + | |||
| + | } else { // 42 / 51 | ||
| + | LastCols( REM_S[ lvl ] ^ COL_VEC, VCBN, KVC, KVN, | ||
| + | VXBN, KVX, SINGLE_S[ lvl ] || ( rev_max_c == COL_VEC ), sub_count ); | ||
| + | } | ||
| + | } // Scope limit for locals. | ||
| + | |||
| + | ReInit: | ||
| + | IVC_S[ lvl ] ++; | ||
| + | goto Loop; | ||
| + | } | ||
| + | #endif // N > 3 | ||
| + | |||
| + | __device__ void LastCols( u64 rem, | ||
| + | u64* const & vcb, const int & nvc, const int & nvn, | ||
| + | u64* const & vxb, const int & nvx, int single, int & sub_count ) | ||
| + | { | ||
| + | for( int c2 = 0; c2 < nvn; c2 ++ ) { | ||
| + | u64 v2 = vcb[ c2 ]; | ||
| + | set_vec( which_col[threadIdx.y], | ||
| + | int single2 = single or ( rev_max_c == v2 ); | ||
| + | int nvn1 = cnext( vcb + nvn, nvc - nvn, topbit( rem ^ v2 ) ); // 2 / 51 | ||
| + | for( int c1 = 0; c1 < nvn1; c1 ++ ) { | ||
| + | u64 v1 = vcb[ nvn + c1 ]; | ||
| + | if( ! ( v2 & v1 ) ) { | ||
| + | u64 v0 = rem ^ v1 ^ v2; | ||
| + | if( ( ! ( v0 & 1 ) ) or v_max_c >= Reverse( v0 ) ) { | ||
| + | // 1.5 / 51 | ||
| + | set_vec( which_col[threadIdx.y], | ||
| + | set_vec( which_col[threadIdx.y], | ||
| + | u64* vxbn = RUWS( nvx ) + vxb; | ||
| + | int kvx = gather_double_cross( vxb, nvx, v2, v1, vxbn ); // 6 / 51 | ||
| + | int shift = | ||
| + | ( ( single2 || rev_max_c == v1 || rev_max_c == v0 ) ? 0 : 1 ); | ||
| + | // avr( kvx ) : 8.6, 27% : > 10, 0.5% : > 20, 0.0005% : > 30 | ||
| + | // 16.5 / 51 | ||
| + | while( kvx > MDIAG ) { | ||
| + | sub_count += MakeDiagS( vxbn, kvx ) << shift; | ||
| + | vxbn ++; | ||
| + | kvx --; | ||
| + | } | ||
| + | sub_count += MakeDiagC( vxbn, kvx ) << shift; | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | return; | ||
| + | } | ||
| + | |||
| + | __device__ int MakeDiagS( u64* const & vxb, const int & nvx ) | ||
| + | { | ||
| + | | ||
| + | |||
| + | u64 vdx = *vxb; | ||
| + | for( int ic = 1; ic < nvx; ic += WS ) { | ||
| + | u64 vcx = -1ULL; | ||
| + | if( ic + threadIdx.x < nvx ) vcx = vxb[ ic + threadIdx.x ]; | ||
| + | #if N%2 | ||
| + | if( __popcll( vdx & vcx ) == 1 ) | ||
| + | #else | ||
| + | if( ( vdx & vcx ) == 0 ) | ||
| + | #endif | ||
| + | { | ||
| + | diag_count += // 4 / 51 | ||
| + | FinalCheck( which_row[ threadIdx.y ], which_col[ threadIdx.y ], | ||
| + | | ||
| + | } | ||
| + | } | ||
| + | | ||
| + | } | ||
| + | |||
| + | __device__ int MakeDiagC( u64* const & vx, const int & vxn ) | ||
| + | { | ||
| + | | ||
| + | |||
| + | for( int ivx = p_offset_c[ vxn ]; ivx < p_offset_c[ vxn + 1 ]; ivx += WS ) { | ||
| + | if( ivx + threadIdx.x < p_offset_c[ vxn + 1 ] ) { | ||
| + | int idx = p_idx_c[ ivx + threadIdx.x ]; | ||
| + | u64 vdx = vx[ idx >> 16 ]; | ||
| + | u64 vcx = vx[ idx & 0xffff ]; | ||
| + | #if N%2 | ||
| + | if( __popcll( vdx & vcx ) == 1 ) | ||
| + | #else | ||
| + | if( ( vdx & vcx ) == 0 ) | ||
| + | #endif | ||
| + | { | ||
| + | diag_count += // 4 / 51 | ||
| + | FinalCheck( which_row[ threadIdx.y ], which_col[ threadIdx.y ], | ||
| + | col_vecs[ threadIdx.y ], row_vecs[ threadIdx.y ], vdx, vcx ); | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | | ||
| + | } | ||
| + | |||
| + | __device__ int FinalCheck( | ||
| + | int* const &e2row, int* const &e2col, u64* const &col_v, u64* const &row_v, | ||
| + | const u64 & vdx, const u64 & vcx ) | ||
| + | { | ||
| + | int i = e2row[ __clzll( vcx & col_v[ 0 ] ) ]; | ||
| + | i = e2col[ __clzll( vdx & row_v[ i ] ) ]; | ||
| + | int j = e2row[ __clzll( vcx & col_v[ i ] ) ]; | ||
| + | j = e2col[ __clzll( vdx & row_v[ j ] ) ]; | ||
| + | if( j != 0 ) return 0; | ||
| + | |||
| + | i = ( i == 1 ) ? 2 : 1; | ||
| + | j = e2row[ __clzll( vcx & col_v[ i ] ) ]; | ||
| + | j = e2col[ __clzll( vdx & row_v[ j ] ) ]; | ||
| + | j = e2row[ __clzll( vcx & col_v[ j ] ) ]; | ||
| + | j = e2col[ __clzll( vdx & row_v[ j ] ) ]; | ||
| + | | ||
| + | } | ||
| + | |||
| + | int main( int argc, char* argv[] ) | ||
| + | { | ||
| + | | ||
| + | | ||
| + | if( sizeof( u64 ) * 8 < NN ) { | ||
| + | fprintf( stderr, | ||
| + | " | ||
| + | exit( 1 ); | ||
| + | } | ||
| + | if( check_topbit( ) ) { | ||
| + | fprintf( stderr, " | ||
| + | exit( 1 ); | ||
| + | } | ||
| + | |||
| + | | ||
| + | | ||
| + | |||
| + | /* pair indexes ----------------------------------- */ | ||
| + | |||
| + | int p_offset[ MDIAG + 2 ]; | ||
| + | int p_idx[ MDPAIR ]; | ||
| + | |||
| + | int ioff = 0; | ||
| + | for( int ndiag = 0; ndiag < MDIAG + 1; ndiag ++ ) { | ||
| + | |||
| + | p_offset[ ndiag ] = ioff; | ||
| + | for( int idx = 0; idx < ndiag - 1; idx ++ ) { | ||
| + | for( int icx = idx + 1; icx < ndiag; icx ++ ) { | ||
| + | | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | | ||
| + | |||
| + | | ||
| + | if( ioff >= MDPAIR ) { | ||
| + | | ||
| + | exit( 1 ); | ||
| + | } | ||
| + | /* ------------------------------------------------ */ | ||
| + | // Get device properties | ||
| + | // Set a part of constant memory | ||
| + | // | ||
| + | int ndev; | ||
| + | | ||
| + | | ||
| + | for( int idev = 0 ; idev < ndev; idev ++ ) { | ||
| + | | ||
| + | | ||
| + | | ||
| + | int nblocks = prop.multiProcessorCount * NB_SM; | ||
| + | | ||
| + | | ||
| + | idev, prop.multiProcessorCount, | ||
| + | | ||
| + | | ||
| + | | ||
| + | } | ||
| + | |||
| + | // | ||
| + | u64 row1P = strtoll( argv[ 2 ], NULL, 16 ); | ||
| + | u64 row2P = strtoll( argv[ 4 ], NULL, 16 ); | ||
| + | | ||
| + | |||
| + | u64 total_count = 0ULL; | ||
| + | |||
| + | for( int max_row = 0; max_row < nvecs; ++ max_row ) { | ||
| + | u64 max_rowP = vecs[ max_row ]; | ||
| + | if( row1P != 0ULL and row1P < max_rowP ) continue; | ||
| + | if( max_rowP < row1P ) break; | ||
| + | if( ! ( max_rowP & 1ULL << ( NN - 1 ) ) ) break; | ||
| + | if( max_rowP < Reverse( max_rowP ) ) continue; | ||
| + | for( int second_row = max_row + 1; second_row < nvecs; ++ second_row ) { | ||
| + | u64 second_rowP = vecs[ second_row ]; | ||
| + | if( row2P != 0ULL and row2P < second_rowP ) continue; | ||
| + | if( second_rowP < row2P ) break; | ||
| + | if( max_rowP & second_rowP ) continue; | ||
| + | if( max_rowP < Reverse( second_rowP ) ) continue; | ||
| + | u64 rem = AllBits ^ max_rowP ^ second_rowP; | ||
| + | if( rem > second_rowP ) break; | ||
| + | |||
| + | | ||
| + | | ||
| + | |||
| + | | ||
| + | |||
| + | | ||
| + | | ||
| + | | ||
| + | |||
| + | # | ||
| + | |||
| + | MD5_CTX md5ctx; | ||
| + | unsigned char md5bin[ MD5_DIGEST_LENGTH ]; | ||
| + | char md5str[ MD5_DIGEST_LENGTH * 2 + 1 ]; | ||
| + | |||
| + | const char* sample = " | ||
| + | int msg_len = strlen( sample ); | ||
| + | char msg[ msg_len + 1 ]; | ||
| + | sprintf( msg, " | ||
| + | vecs[ max_row ], vecs[ second_row ], count_m ); | ||
| + | |||
| + | MD5_Init( &md5ctx ); | ||
| + | MD5_Update( & | ||
| + | MD5_Final( md5bin, &md5ctx ); | ||
| + | |||
| + | for( int i = 0; i < MD5_DIGEST_LENGTH; | ||
| + | sprintf( md5str + ( i * 2 ), " | ||
| + | md5str[ MD5_DIGEST_LENGTH * 2 ] = 0; | ||
| + | |||
| + | printf( " | ||
| + | |||
| + | # | ||
| + | |||
| + | | ||
| + | } | ||
| + | } | ||
| + | if( row2P == 0ULL ) fprintf( stderr, " | ||
| + | |||
| + | | ||
| + | } | ||
| + | |||
| + | </ | ||
ms-20230131.cu.1677834659.txt.gz · Last modified: 2023/03/03 18:10 by mino
