sample
- samle.txt
__device__ int cnext( const u64* __restrict__ vp, const int & nv, const int must_bit ) { int j = 0; #pragma unroll 1 for( int i = 0; i < nv; i += WS ) { bool p = false; if( i + threadIdx.x < nv ) { p = vp[ i + threadIdx.x ] >> must_bit; } j += __popc( __ballot_sync( AllLanes, p ) ); if( j != i + WS ) break; } return j; } __device__ int gather_cross( const u64* __restrict__ const & vp, const int & nv, const u64 & x, u64* __restrict__ const & vpn ) { int j = 0; #pragma unroll 1 for( int i = 0; i < nv; i += WS ) { u64 v = 0ULL; if( i + threadIdx.x < nv ) v = vp[ i + threadIdx.x ]; bool p = __popcll( v & x ) == 1; unsigned ballot = __ballot_sync( AllLanes, p ); int sub_sum = __popc( ballot << ( WS - threadIdx.x ) ); if( p ) vpn[ j + sub_sum ] = v; j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 ); } return j; } __device__ void set_vecr( u64 vecs[ ], int index[ ], const u64 & v, const int & level ) { vecs[ level ] = v; if( ( v >> threadIdx.x ) & 1 ) { index[ NN - 1 - threadIdx.x ] = level; } if( ( ( v >> WS ) >> threadIdx.x ) & 1 ) { index[ NN - 1 - ( WS + threadIdx.x ) ] = level; } __syncwarp(); } __device__ void LastCols( unsigned rem, const u64* __restrict__ const & vcb, const int & nvn, const int nvcmn, u64* __restrict__ const & vxb, const int & nvx, const bool single, u64 & sub_count, int & sm_count ) { unsigned thread_bit = 1U << threadIdx.x; u64* thread_ep = & which_colv[ threadIdx.y ] [ threadIdx.x ]; int nc21 = nvn * ( nvcmn - 1 ); for( int ic21w = 0; ic21w < nc21; ic21w += WS ) { bool v1ep = false; bool psingl = single; if( ic21w + threadIdx.x < nc21 ) { u64 v2 = vcb[ ( ic21w + threadIdx.x ) / ( nvcmn - 1 ) ]; unsigned v1 = unsigned( vcb[ nvn + ( ic21w + threadIdx.x ) % ( nvcmn - 1 ) ] ); unsigned v0 = rem ^ unsigned( v2 ) ^ v1; v1ep = ( v1 >> topbit( rem ^ unsigned( v2 ) ) ) and ( ( v1 & unsigned(v2) ) == 0 ) and ( v_max32_c >= __brev(v0) ); psingl = psingl or rev_max32_c == v2 or rev_max32_c == v1 or rev_max32_c == v0 ; } unsigned v1e = __brev( __ballot_sync( AllLanes, v1ep ) ); unsigned dbl = __ballot_sync( AllLanes, ! psingl ); int ic21 = 0; while( v1e ) { int skip = __clz( v1e ); ic21 += skip; u64 v2 = vcb[ ( ic21w + ic21 ) / ( nvcmn - 1 ) ]; unsigned v1 = unsigned( vcb[ nvn + ( ic21w + ic21 ) % ( nvcmn - 1 ) ] ); dbl >>= ( skip ); v1e <<= ( skip + 1 ); ic21 ++; if( unsigned( v2 ) & thread_bit ) *thread_ep = v2; if( unsigned( v2 >> WS ) & thread_bit ) *( thread_ep + WS ) = v2; if( ( rem ^ unsigned( v2 ) ) & thread_bit ) { *thread_ep = u64( v1 ^ ( ( v1 & thread_bit ) ? 0 : ( rem ^ unsigned( v2 ) ) ) ); // __syncwarp(); // synced in the gather_double_cross below. } u64* vxbn = RUWS( nvx ) + vxb; int kvx = gather_double_cross( vxb, nvx, v2, v1, vxbn );// 8/ 42 int shift = ( dbl & 1 ); dbl >>= 1; // avr( kvx ) : 8.6, 27% : > 10, 0.5% : > 20, 0.0005% : > 30 while( kvx > MDIAG ) { // 14 / 42 sub_count += MakeDiagL( vxbn, kvx ) << shift; vxbn ++; kvx --; } sub_count += MakeDiag( vxbn, kvx ) << shift; sm_count += ( 1 << shift ); __syncwarp(); // !! // necessary to prevent shared data from being updated too early. } } return; } __device__ int MakeDiag( const u64* __restrict__ const & vx, const int & vxn ) { int diag_count = 0; #pragma unroll 1 for( int ivx = p_offset_c[ vxn ]; ivx < p_offset_c[ vxn + 1 ]; ivx += WS ) { if( ivx + threadIdx.x < p_offset_c[ vxn + 1 ] ) { unsigned int idx = p_idx_c[ ivx + threadIdx.x ]; u64 vdx = vx[ idx >> 16 ]; u64 vcx = vx[ idx & 0xffff ]; #if N%2 if( __popcll( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 1 ) #else if( ( vdx & vcx & ( ( 1ULL << 40 ) - 1 ) ) == 0 ) #endif { // 4-10 / 42 if( FinalCheck( vdx, vcx ) ) diag_count ++; } } } return diag_count; } __device__ bool FinalCheck( const u64 & vdx, const u64 & vcx ) { int* e2row = which_row[ threadIdx.y ] + NN - 64; u64* e2colv = which_colv[ threadIdx.y ]; int i; i = e2row[ __clzll( vdx & e2colv[ ( vcx >> 40 ) & 0xff ] ) ]; if( e2row[ __clzll( vcx & e2colv[ ( vdx >> 40 ) & 0xff ] ) ] != i ) return false; i = 40 + ( ( ( i & 1 ) + 1 ) << 3 ); return ( e2row[ __clzll( vdx & e2colv[ ( vcx >> i ) & 0xff ] ) ] == e2row[ __clzll( vcx & e2colv[ ( vdx >> i ) & 0xff ] ) ] ) ; }
sample.txt · Last modified: 2024/08/26 08:37 by 127.0.0.1
