// sample.txt
// Warp-cooperative scan over vp[0..nv): counts elements whose value has any
// bit set at position must_bit or above.  Scans WS elements per round and
// stops after the first round that is not fully qualifying; the returned
// count includes every qualifier seen up to and including that round.
// All WS lanes of the warp must call this together (uniform __ballot_sync).
__device__ int cnext( const u64* __restrict__ vp, 
              const int & nv, const int must_bit )
{
   int count = 0;
#pragma unroll 1
   for( int base = 0; base < nv; base += WS ) {
      const int pos = base + threadIdx.x;
      bool qual = false;
      if( pos < nv )
         qual = ( vp[ pos ] >> must_bit ) != 0ULL;
      count += __popc( __ballot_sync( AllLanes, qual ) );
      // Only a completely qualifying round lets the scan continue.
      if( count != base + WS )
         break;
   }
   return count;
}
 
// Warp-cooperative stream compaction: copy into vpn, preserving order, those
// elements of vp[0..nv) that share exactly one set bit with x.  Returns the
// number of elements written (uniform across the warp).
//
// All WS lanes must enter together: __ballot_sync / __shfl_sync are executed
// on uniform control flow.  vp and vpn must not alias the copied range.
__device__ int gather_cross( 
       const u64* __restrict__ const & vp, const int & nv, 
       const u64 & x, u64* __restrict__ const & vpn )
{
   int j = 0;   // total elements written so far
#pragma unroll 1
   for( int i = 0; i < nv; i += WS ) {
      u64 v = 0ULL;
      if( i + threadIdx.x < nv ) 
          v = vp[ i + threadIdx.x ];
      bool p = __popcll( v & x ) == 1;   // keep iff exactly one common bit
      unsigned ballot = __ballot_sync( AllLanes, p );
      // Exclusive prefix count of keepers in lower lanes.  The previous form
      // `__popc( ballot << ( WS - threadIdx.x ) )` shifted by WS (32) in lane
      // 0 -- undefined behavior in C++ that only worked because PTX clamps
      // shift amounts.  Masking the lower lanes is well defined and equal.
      int sub_sum = __popc( ballot & ( ( 1U << threadIdx.x ) - 1U ) );
      if( p )
          vpn[ j + sub_sum ] = v;
      // Round total = last lane's exclusive prefix + its own keep flag.
      j += __shfl_sync( AllLanes, sub_sum + p, WS - 1 );
   }
   return j;
}
 
// Record vector v at the given level and, for each set bit of v, stamp the
// matching index[] slot with that level.  Bits [0, WS) of v are handled by
// lane threadIdx.x, bits [WS, 2*WS) by the same lane at offset +WS; slots
// are addressed from the top of index[] downward (NN - 1 - bit).
__device__ void set_vecr( u64 vecs[ ], int index[ ], 
                             const u64 & v, const int & level ) 
{
  vecs[ level ] = v;

  const u64 low_bit  = ( v >> threadIdx.x ) & 1;
  const u64 high_bit = ( ( v >> WS ) >> threadIdx.x ) & 1;

  if( low_bit )
     index[ NN - 1 - threadIdx.x ] = level;
  if( high_bit )
     index[ NN - 1 - WS - threadIdx.x ] = level;

  __syncwarp();
}
 
// Enumerate (v2, v1) column-pair candidates over the tail columns and
// accumulate diagonal counts into sub_count / sm_count.
//
// vcb lays out candidates as: v2 values in [0, nvn), then v1 values in
// [nvn, nvn + nvcmn - 1); a flat index is split with / and % below.
// rem appears to be the 32-bit mask of still-unused columns, so that
// v0 = rem ^ v2 ^ v1 completes the triple -- TODO confirm with caller.
//
// Warp contract: all WS lanes must enter together -- __ballot_sync(AllLanes)
// and __syncwarp() are executed on uniform control flow inside the loops.
__device__ void LastCols( unsigned rem, 
       const u64* __restrict__ const & vcb, const int & nvn, const int nvcmn, 
       u64* __restrict__ const & vxb, const int & nvx, 
       const bool single, u64 & sub_count, int & sm_count )
{
 
  unsigned thread_bit = 1U << threadIdx.x;   // this lane's column bit
  // Per-lane slot in the shared column-vector table of this warp (threadIdx.y).
  u64* thread_ep =  & which_colv[ threadIdx.y ] [ threadIdx.x ];
 
  // One flat index ranges over all nvn * (nvcmn - 1) pairs, WS at a time.
  int nc21 = nvn * ( nvcmn - 1 );
  for( int ic21w = 0; ic21w < nc21; ic21w += WS ) {
 
    bool v1ep = false;     // does this lane's (v2, v1) pair qualify?
    bool psingl = single;  // count this lane's pair once (not doubled)?
 
    if( ic21w + threadIdx.x < nc21 ) {
      u64 v2 = vcb[ ( ic21w + threadIdx.x ) / ( nvcmn - 1 ) ];
      unsigned v1 = 
          unsigned( vcb[ nvn + ( ic21w + threadIdx.x ) % ( nvcmn - 1 ) ] );
      unsigned v0 = rem ^ unsigned( v2 ) ^ v1;   // remaining third vector
      // Qualify: v1 reaches at/above the top bit of rem^v2, is bit-disjoint
      // from v2, and bit-reversed v0 does not exceed v_max32_c (canonical
      // ordering prune, presumably -- verify against the search invariant).
      v1ep = ( v1 >> topbit( rem ^ unsigned( v2 ) ) ) 
               and ( ( v1 & unsigned(v2) ) == 0 ) 
               and ( v_max32_c >= __brev(v0) );
      // A pair counts once when any member equals rev_max32_c; otherwise the
      // dbl ballot below makes its counts double (shift == 1 further down).
      psingl = psingl 
                 or rev_max32_c == v2
                 or rev_max32_c == v1 
                 or rev_max32_c == v0 ;
    }
    // v1e: qualifying lanes, bit-reversed so __clz visits them in ascending
    // lane order.  dbl: doubled lanes, consumed LSB-first in lockstep.
    unsigned v1e = __brev( __ballot_sync( AllLanes, v1ep ) );
    unsigned dbl = __ballot_sync( AllLanes, ! psingl );
 
    // The whole warp now re-walks the qualifying pairs together (uniform).
    int ic21 = 0;
    while( v1e ) {
 
      int skip = __clz( v1e );   // lanes to skip until the next set bit
      ic21 += skip;
 
      u64 v2 = vcb[ ( ic21w + ic21 ) / ( nvcmn - 1 ) ];
      unsigned v1 = unsigned( vcb[ nvn + ( ic21w + ic21 ) % ( nvcmn - 1 ) ] );
 
      dbl >>= ( skip );          // keep dbl aligned with the current pair
      // NOTE(review): when skip == 31 this shifts by 32 -- undefined in C++,
      // relies on PTX clamping the shift amount to yield 0 and end the loop.
      v1e <<= ( skip + 1 );      // consume the bit just found
      ic21 ++;
 
      // Publish this pair's column vectors into the shared table: each lane
      // owns one column (thread_bit) and records the vector covering it.
      if( unsigned( v2 ) & thread_bit ) *thread_ep = v2;
      if( unsigned( v2 >> WS ) & thread_bit ) *( thread_ep + WS ) = v2;
      if( ( rem ^ unsigned( v2 ) ) & thread_bit ) {
         *thread_ep 
           = u64( v1 ^ ( ( v1 & thread_bit ) ? 0 : ( rem ^ unsigned( v2 ) ) ) );
         // __syncwarp(); // synced in the gather_double_cross below.
      }
 
      // Filter vxb[0..nvx) against (v2, v1) into scratch past the buffer.
      u64* vxbn = RUWS( nvx ) + vxb;
      int kvx = gather_double_cross( vxb, nvx, v2, v1, vxbn );// 8/ 42
 
      int shift = ( dbl & 1 );   // 1 -> this pair's counts are doubled
      dbl >>= 1;
 
      // avr( kvx ) : 8.6,  27% : > 10,  0.5% : > 20,  0.0005% : > 30
      // Peel elements one at a time until the list fits MakeDiag's table.
      while( kvx > MDIAG ) { // 14 / 42
        sub_count += MakeDiagL( vxbn, kvx ) << shift;
        vxbn ++;
        kvx --;
      }
      sub_count += MakeDiag( vxbn, kvx ) << shift;
      sm_count += ( 1 << shift );
 
       __syncwarp(); // !!
         // necessary to prevent shared data from being updated too early.
    } 
  }
  return;
}
 
// Count the vector pairs listed in the precomputed index table p_idx_c for
// bucket vxn that pass the low-40-bit overlap test and FinalCheck.  Each
// 32-bit table entry packs two indices into vx (high 16 / low 16 bits).
// Warp-strided: each lane handles every WS-th table entry; the per-lane
// partial counts are NOT reduced here.
__device__ 
int MakeDiag( const u64* __restrict__ const & vx,  const int & vxn ) 
{
   int found = 0;
   const int begin = p_offset_c[ vxn ];
   const int end   = p_offset_c[ vxn + 1 ];
#pragma unroll 1
   for( int base = begin; base < end; base += WS ) {
      const int pos = base + threadIdx.x;
      if( pos < end ) {
         const unsigned int packed = p_idx_c[ pos ];
         const u64 vdiag = vx[ packed >> 16 ];
         const u64 vcross = vx[ packed & 0xffff ];
         const u64 overlap = vdiag & vcross & ( ( 1ULL << 40 ) - 1 );
#if N%2
         const bool candidate = ( __popcll( overlap ) == 1 );
#else
         const bool candidate = ( overlap == 0 );
#endif
         // 4-10 / 42
         if( candidate && FinalCheck( vdiag, vcross ) )
            found ++;
      }
   }
   return found;
}
 
// Final pairing test for (vdx, vcx): looks up, via the shared which_colv /
// which_row tables of this warp (threadIdx.y), the row reached from each
// vector through the other's byte at bits [40,48); both lookups must agree,
// and then a second byte (selected by the parity of that row) must agree too.
__device__ bool FinalCheck( const u64 & vdx, const u64 & vcx )
{
   int* rows = which_row[ threadIdx.y ] + NN - 64;
   u64* colv = which_colv[ threadIdx.y ];

   const int row_d = rows[ __clzll( vdx & colv[ ( vcx >> 40 ) & 0xff ] ) ];
   const int row_c = rows[ __clzll( vcx & colv[ ( vdx >> 40 ) & 0xff ] ) ];
   if( row_d != row_c ) 
       return false;

   // Second probe byte: 48 or 56 depending on the parity of the first row.
   const int sh = 40 + ( ( ( row_d & 1 ) + 1 ) << 3 );

   return rows[ __clzll( vdx & colv[ ( vcx >> sh ) & 0xff ] ) ] 
          ==
          rows[ __clzll( vcx & colv[ ( vdx >> sh ) & 0xff ] ) ];
}