
.. _program_listing_file_library_include_rocrand_rocrand_mtgp32.h:

Program Listing for File rocrand_mtgp32.h
=========================================

|exhale_lsh| :ref:`Return to documentation for file <file_library_include_rocrand_rocrand_mtgp32.h>` (``library/include/rocrand/rocrand_mtgp32.h``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
   //
   // Permission is hereby granted, free of charge, to any person obtaining a copy
   // of this software and associated documentation files (the "Software"), to deal
   // in the Software without restriction, including without limitation the rights
   // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   // copies of the Software, and to permit persons to whom the Software is
   // furnished to do so, subject to the following conditions:
   //
   // The above copyright notice and this permission notice shall be included in
   // all copies or substantial portions of the Software.
   //
   // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
   // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   // THE SOFTWARE.
   
   /*
    * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
    * University.  All rights reserved.
    * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
    * University and University of Tokyo.  All rights reserved.
    *
    * Redistribution and use in source and binary forms, with or without
    * modification, are permitted provided that the following conditions are
    * met:
    *
    *     * Redistributions of source code must retain the above copyright
    *       notice, this list of conditions and the following disclaimer.
    *     * Redistributions in binary form must reproduce the above
    *       copyright notice, this list of conditions and the following
    *       disclaimer in the documentation and/or other materials provided
    *       with the distribution.
    *     * Neither the name of the Hiroshima University nor the names of
    *       its contributors may be used to endorse or promote products
    *       derived from this software without specific prior written
    *       permission.
    *
    * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    */
   
   #ifndef ROCRAND_MTGP32_H_
   #define ROCRAND_MTGP32_H_
   
   #include <stdlib.h>
   
   #ifndef FQUALIFIERS
   #define FQUALIFIERS __forceinline__ __device__
   #endif // FQUALIFIERS_
   
   #include "rocrand/rocrand.h"
   #include "rocrand/rocrand_common.h"
   
   #define MTGP_MEXP 11213
   #define MTGP_N 351
   #define MTGP_FLOOR_2P 256
   #define MTGP_CEIL_2P 512
   #define MTGP_TN MTGP_FLOOR_2P
   #define MTGP_LS (MTGP_TN * 3)
   #define MTGP_BN_MAX 512
   #define MTGP_TS 16
   #define MTGP_STATE 1024
   #define MTGP_MASK 1023
   
   // Source: https://github.com/MersenneTwister-Lab/MTGP/blob/master/mtgp32-fast.h
   struct mtgp32_params_fast_t {
       int mexp;            
       int pos;            
       int sh1;            
       int sh2;            
       uint32_t tbl[16];        
       uint32_t tmp_tbl[16];    
       uint32_t flt_tmp_tbl[16];    
       uint32_t mask;        
       unsigned char poly_sha1[21]; 
   };
   
   namespace rocrand_device {
   
   struct mtgp32_params
   {
       unsigned int pos_tbl[MTGP_BN_MAX];
       unsigned int param_tbl[MTGP_BN_MAX][MTGP_TS];
       unsigned int temper_tbl[MTGP_BN_MAX][MTGP_TS];
       unsigned int single_temper_tbl[MTGP_BN_MAX][MTGP_TS];
       unsigned int sh1_tbl[MTGP_BN_MAX];
       unsigned int sh2_tbl[MTGP_BN_MAX];
       unsigned int mask[1];
   };
   
   typedef mtgp32_params_fast_t mtgp32_fast_params;
   
   struct mtgp32_state
   {
       int offset;
       int id;
       unsigned int status[MTGP_STATE];
   };
   
   inline
   void rocrand_mtgp32_init_state(unsigned int array[],
                                  const mtgp32_fast_params *para, unsigned int seed)
   {
       int i;
       int size = para->mexp / 32 + 1;
       unsigned int hidden_seed;
       unsigned int tmp;
       hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
       tmp = hidden_seed;
       tmp += tmp >> 16;
       tmp += tmp >> 8;
       memset(array, tmp & 0xff, sizeof(unsigned int) * size);
       array[0] = seed;
       array[1] = hidden_seed;
       for (i = 1; i < size; i++)
           array[i] ^= (1812433253) * (array[i - 1] ^ (array[i - 1] >> 30)) + i;
   }
   
   class mtgp32_engine
   {
   public:
       FQUALIFIERS
       mtgp32_engine()
       {
   
       }
   
       FQUALIFIERS
       mtgp32_engine(const mtgp32_state m_state,
                     const mtgp32_params * params,
                     int bid)
       {
           this->m_state = m_state;
           pos_tbl = params->pos_tbl[bid];
           sh1_tbl = params->sh1_tbl[bid];
           sh2_tbl = params->sh2_tbl[bid];
           mask = params->mask[0];
           for (int j = 0; j < MTGP_TS; j++) {
               param_tbl[j] = params->param_tbl[bid][j];
               temper_tbl[j] = params->temper_tbl[bid][j];
               single_temper_tbl[j] = params->single_temper_tbl[bid][j];
           }
       }
   
       FQUALIFIERS
       void copy(const mtgp32_engine * m_engine)
       {
   #if defined(__HIP_DEVICE_COMPILE__) || defined(USE_HIP_CPU)
           const unsigned int thread_id = threadIdx.x;
           for(int i = thread_id; i < MTGP_STATE; i += blockDim.x)
               m_state.status[i] = m_engine->m_state.status[i];
   
           if (thread_id == 0)
           {
               m_state.offset = m_engine->m_state.offset;
               m_state.id = m_engine->m_state.id;
               pos_tbl = m_engine->pos_tbl;
               sh1_tbl = m_engine->sh1_tbl;
               sh2_tbl = m_engine->sh2_tbl;
               mask = m_engine->mask;
           }
           if (thread_id < MTGP_TS)
           {
               param_tbl[thread_id] = m_engine->param_tbl[thread_id];
               temper_tbl[thread_id] = m_engine->temper_tbl[thread_id];
               single_temper_tbl[thread_id] = m_engine->single_temper_tbl[thread_id];
           }
           __syncthreads();
   #else
           this->m_state = m_engine->m_state;
           pos_tbl = m_engine->pos_tbl;
           sh1_tbl = m_engine->sh1_tbl;
           sh2_tbl = m_engine->sh2_tbl;
           mask = m_engine->mask;
           for (int j = 0; j < MTGP_TS; j++) {
               param_tbl[j] = m_engine->param_tbl[j];
               temper_tbl[j] = m_engine->temper_tbl[j];
               single_temper_tbl[j] = m_engine->single_temper_tbl[j];
           }
   #endif
       }
   
       FQUALIFIERS
       void set_params(mtgp32_params * params)
       {
           pos_tbl = params->pos_tbl[m_state.id];
           sh1_tbl = params->sh1_tbl[m_state.id];
           sh2_tbl = params->sh2_tbl[m_state.id];
           mask = params->mask[0];
           for (int j = 0; j < MTGP_TS; j++) {
               param_tbl[j] = params->param_tbl[m_state.id][j];
               temper_tbl[j] = params->temper_tbl[m_state.id][j];
               single_temper_tbl[j] = params->single_temper_tbl[m_state.id][j];
           }
       }
   
       FQUALIFIERS
       unsigned int operator()()
       {
           return this->next();
       }
   
       FQUALIFIERS
       unsigned int next()
       {
   #if defined(__HIP_DEVICE_COMPILE__) || defined(USE_HIP_CPU)
           unsigned int t   = threadIdx.x;
           unsigned int d   = blockDim.x;
           int pos = pos_tbl;
           unsigned int r;
           unsigned int o;
   
           r = para_rec(m_state.status[(t + m_state.offset) & MTGP_MASK],
                        m_state.status[(t + m_state.offset + 1) & MTGP_MASK],
                        m_state.status[(t + m_state.offset + pos) & MTGP_MASK]);
           m_state.status[(t + m_state.offset + MTGP_N) & MTGP_MASK] = r;
   
           o = temper(r, m_state.status[(t + m_state.offset + pos - 1) & MTGP_MASK]);
           __syncthreads();
           if (t == 0)
               m_state.offset = (m_state.offset + d) & MTGP_MASK;
           __syncthreads();
           return o;
   #else
           return 0;
   #endif
       }
   
       FQUALIFIERS
       unsigned int next_single()
       {
   #if defined(__HIP_DEVICE_COMPILE__) || defined(USE_HIP_CPU)
           unsigned int t   = threadIdx.x;
           unsigned int d   = blockDim.x;
           int pos = pos_tbl;
           unsigned int r;
           unsigned int o;
   
           r = para_rec(m_state.status[(t + m_state.offset) & MTGP_MASK],
                        m_state.status[(t + m_state.offset + 1) & MTGP_MASK],
                        m_state.status[(t + m_state.offset + pos) & MTGP_MASK]);
           m_state.status[(t + m_state.offset + MTGP_N) & MTGP_MASK] = r;
   
           o = temper_single(r, m_state.status[(t + m_state.offset + pos - 1) & MTGP_MASK]);
           __syncthreads();
           if (t == 0)
               m_state.offset = (m_state.offset + d) & MTGP_MASK;
           __syncthreads();
           return o;
   #else
           return 0;
   #endif
       }
   
   private:
       FQUALIFIERS
       unsigned int para_rec(unsigned int X1, unsigned int X2, unsigned int Y)
       {
           unsigned int X = (X1 & mask) ^ X2;
           unsigned int MAT;
   
           X ^= X << sh1_tbl;
           Y = X ^ (Y >> sh2_tbl);
           MAT = param_tbl[Y & 0x0f];
           return Y ^ MAT;
       }
   
       FQUALIFIERS
       unsigned int temper(unsigned int V, unsigned int T)
       {
           unsigned int MAT;
   
           T ^= T >> 16;
           T ^= T >> 8;
           MAT = temper_tbl[T & 0x0f];
           return V ^ MAT;
       }
   
       FQUALIFIERS
       unsigned int temper_single(unsigned int V, unsigned int T)
       {
           unsigned int MAT;
           unsigned int r;
   
           T ^= T >> 16;
           T ^= T >> 8;
           MAT = single_temper_tbl[T & 0x0f];
           r = (V >> 9) ^ MAT;
           return r;
       }
   
   public:
       // State
       mtgp32_state m_state;
       // Parameters
       unsigned int pos_tbl;
       unsigned int param_tbl[MTGP_TS];
       unsigned int temper_tbl[MTGP_TS];
       unsigned int sh1_tbl;
       unsigned int sh2_tbl;
       unsigned int single_temper_tbl[MTGP_TS];
       unsigned int mask;
   
   }; // mtgp32_engine class
   
   } // end namespace rocrand_device
   
   typedef rocrand_device::mtgp32_engine rocrand_state_mtgp32;
   typedef rocrand_device::mtgp32_state mtgp32_state;
   typedef rocrand_device::mtgp32_fast_params mtgp32_fast_params;
   typedef rocrand_device::mtgp32_params mtgp32_params;
   
   __host__ inline
   rocrand_status rocrand_make_state_mtgp32(rocrand_state_mtgp32 * d_state,
                                            mtgp32_fast_params params[],
                                            int n,
                                            unsigned long long seed)
   {
       int i;
       rocrand_state_mtgp32 * h_state = (rocrand_state_mtgp32 *) malloc(sizeof(rocrand_state_mtgp32) * n);
       seed = seed ^ (seed >> 32);
   
       if (h_state == NULL)
           return ROCRAND_STATUS_ALLOCATION_FAILED;
   
       for (i = 0; i < n; i++) {
           rocrand_device::rocrand_mtgp32_init_state(&(h_state[i].m_state.status[0]), &params[i], (unsigned int)seed + i + 1);
           h_state[i].m_state.offset = 0;
           h_state[i].m_state.id = i;
           h_state[i].pos_tbl = params[i].pos;
           h_state[i].sh1_tbl = params[i].sh1;
           h_state[i].sh2_tbl = params[i].sh2;
           h_state[i].mask = params[0].mask;
           for (int j = 0; j < MTGP_TS; j++) {
               h_state[i].param_tbl[j] = params[i].tbl[j];
               h_state[i].temper_tbl[j] = params[i].tmp_tbl[j];
               h_state[i].single_temper_tbl[j] = params[i].flt_tmp_tbl[j];
           }
       }
   
       hipMemcpy(d_state, h_state, sizeof(rocrand_state_mtgp32) * n, hipMemcpyHostToDevice);
       free(h_state);
   
       if (hipGetLastError() != hipSuccess)
           return ROCRAND_STATUS_ALLOCATION_FAILED;
   
       return ROCRAND_STATUS_SUCCESS;
   }
   
   __host__ inline
   rocrand_status rocrand_make_constant(const mtgp32_fast_params params[], mtgp32_params * p)
   {
       const int block_num = MTGP_BN_MAX;
       const int size1 = sizeof(uint32_t) * block_num;
       const int size2 = sizeof(uint32_t) * block_num * MTGP_TS;
       uint32_t *h_pos_tbl;
       uint32_t *h_sh1_tbl;
       uint32_t *h_sh2_tbl;
       uint32_t *h_param_tbl;
       uint32_t *h_temper_tbl;
       uint32_t *h_single_temper_tbl;
       uint32_t *h_mask;
       h_pos_tbl = (uint32_t *)malloc(size1);
       h_sh1_tbl = (uint32_t *)malloc(size1);
       h_sh2_tbl = (uint32_t *)malloc(size1);
       h_param_tbl = (uint32_t *)malloc(size2);
       h_temper_tbl = (uint32_t *)malloc(size2);
       h_single_temper_tbl = (uint32_t *)malloc(size2);
       h_mask = (uint32_t *)malloc(sizeof(uint32_t));
       rocrand_status status = ROCRAND_STATUS_SUCCESS;
   
       if (h_pos_tbl == NULL || h_sh1_tbl == NULL || h_sh2_tbl == NULL
           || h_param_tbl == NULL || h_temper_tbl == NULL || h_single_temper_tbl == NULL
           || h_mask == NULL) {
           printf("failure in allocating host memory for constant table.\n");
           status = ROCRAND_STATUS_ALLOCATION_FAILED;
       }
       else {
           h_mask[0] = params[0].mask;
           for (int i = 0; i < block_num; i++) {
               h_pos_tbl[i] = params[i].pos;
               h_sh1_tbl[i] = params[i].sh1;
               h_sh2_tbl[i] = params[i].sh2;
               for (int j = 0; j < MTGP_TS; j++) {
                   h_param_tbl[i * MTGP_TS + j] = params[i].tbl[j];
                   h_temper_tbl[i * MTGP_TS + j] = params[i].tmp_tbl[j];
                   h_single_temper_tbl[i * MTGP_TS + j] = params[i].flt_tmp_tbl[j];
               }
           }
   
           if (hipMemcpy(p->pos_tbl, h_pos_tbl, size1, hipMemcpyHostToDevice) != hipSuccess)
               status = ROCRAND_STATUS_ALLOCATION_FAILED;
           if (hipMemcpy(p->sh1_tbl, h_sh1_tbl, size1, hipMemcpyHostToDevice) != hipSuccess)
               status = ROCRAND_STATUS_ALLOCATION_FAILED;
           if (hipMemcpy(p->sh2_tbl, h_sh2_tbl, size1, hipMemcpyHostToDevice) != hipSuccess)
               status = ROCRAND_STATUS_ALLOCATION_FAILED;
           if (hipMemcpy(p->param_tbl, h_param_tbl, size2, hipMemcpyHostToDevice) != hipSuccess)
               status = ROCRAND_STATUS_ALLOCATION_FAILED;
           if (hipMemcpy(p->temper_tbl, h_temper_tbl, size2, hipMemcpyHostToDevice) != hipSuccess)
               status = ROCRAND_STATUS_ALLOCATION_FAILED;
           if (hipMemcpy(p->single_temper_tbl, h_single_temper_tbl, size2, hipMemcpyHostToDevice) != hipSuccess)
               status = ROCRAND_STATUS_ALLOCATION_FAILED;
           if (hipMemcpy(p->mask, h_mask, sizeof(unsigned int), hipMemcpyHostToDevice) != hipSuccess)
               status = ROCRAND_STATUS_ALLOCATION_FAILED;
       }
   
       free(h_pos_tbl);
       free(h_sh1_tbl);
       free(h_sh2_tbl);
       free(h_param_tbl);
       free(h_temper_tbl);
       free(h_single_temper_tbl);
       free(h_mask);
   
       return status;
   }
   
   FQUALIFIERS
   unsigned int rocrand(rocrand_state_mtgp32 * state)
   {
       return state->next();
   }
   
   FQUALIFIERS
   void rocrand_mtgp32_block_copy(rocrand_state_mtgp32 * src, rocrand_state_mtgp32 * dest)
   {
       dest->copy(src);
   }
   
   FQUALIFIERS
   void rocrand_mtgp32_set_params(rocrand_state_mtgp32 * state, mtgp32_params * params)
   {
       state->set_params(params);
   }
   
    // end of group rocranddevice
   #endif // ROCRAND_MTGP32_H_
