How to use the numba.cuda.to_device function in numba

To help you get started, we’ve selected a few examples showing how numba.cuda.to_device is used in popular open-source projects.

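Every example below follows the same basic pattern: numba.cuda.to_device copies a host-side NumPy array into GPU memory and returns a DeviceNDArray, which can be passed to CUDA kernels and read back with copy_to_host. As a minimal, self-contained sketch of that round trip (the array names here are illustrative, not taken from any of the projects below):

import numpy as np
from numba import cuda

# Host array (any contiguous NumPy array works)
host_arr = np.arange(10, dtype=np.float32)

# Copy host -> device; the result is a numba.cuda DeviceNDArray
device_arr = cuda.to_device(host_arr)

# ... CUDA kernels that read or write device_arr would be launched here ...

# Copy device -> host when the data is needed on the CPU again
result = device_arr.copy_to_host()
assert np.array_equal(result, host_arr)
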
github fbpic / fbpic / tests / unautomated / test_cuda_transform.py
    # Parameters
    Nz = 2048
    Nr = 256
    rmax = 50.e-6
    m = 0

    # Initialize the random test_field
    interp_field_r = np.random.rand(Nz, Nr) + 1.j*np.random.rand(Nz, Nr)
    interp_field_t = np.random.rand(Nz, Nr) + 1.j*np.random.rand(Nz, Nr)
    d_interp_field_r = cuda.to_device( interp_field_r )
    d_interp_field_t = cuda.to_device( interp_field_t )
    # Initialize the field in spectral space
    spect_field_p = np.empty_like( interp_field_r )
    spect_field_m = np.empty_like( interp_field_t )
    d_spect_field_p = cuda.to_device( spect_field_p )
    d_spect_field_m = cuda.to_device( spect_field_m )
    # Initialize the field after back and forth transformation
    back_field_r = np.empty_like( interp_field_r )
    back_field_t = np.empty_like( interp_field_t )
    d_back_field_r = cuda.to_device( back_field_r )
    d_back_field_t = cuda.to_device( back_field_t )

    # ----------------
    # Scalar transform
    # ----------------
    print( '\n ### Scalar transform \n' )
    
    # Perform the transform on the CPU
    trans_cpu = SpectralTransformer( Nz, Nr, m, rmax )
    # Do a loop so as to get the fastest time
    # and remove compilation time
    tmin = 1.
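
In the fbpic snippet above, spect_field_p, spect_field_m, back_field_r and back_field_t are empty host arrays that are copied to the GPU purely to reserve output buffers. When only the shape and dtype matter, numba.cuda.device_array_like allocates the buffer directly on the device and avoids the host-to-device copy; a small sketch of that alternative (sizes reduced, not part of the fbpic test):

import numpy as np
from numba import cuda

Nz, Nr = 64, 32
interp_field_r = np.random.rand(Nz, Nr) + 1.j*np.random.rand(Nz, Nr)

# Input data: copy host -> device
d_interp_field_r = cuda.to_device(interp_field_r)

# Output buffer: allocate directly on the device, no host copy involved
d_spect_field_p = cuda.device_array_like(interp_field_r)
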
github rapidsai / cuxfilter / python / cuxfilter / assets / numba_kernels / gpu_datatile.py
    results = []
    for groupby_result in groupby_results:

        list_of_indices = list(
            np.unique(groupby_result[check_list[-1]].to_array().astype(int))
        )
        groupby_as_ndarray = cuda.to_device(
            groupby_result.to_pandas().values.astype(float)
        )

        del groupby_result
        gc.collect()
        max_s = int((max_1 - min_1) / stride_1) + 1
        min_s = int((max_2 - min_2) / stride_2) + 1
        result = cuda.to_device(
            np.zeros(shape=(min_s, max_s)).astype(np.float64)
        )

        # Launch the kernel with 64 blocks of 64 threads each
        calc_cumsum_data_tile[64, 64](groupby_as_ndarray, result)
        if not cumsum:
            result_np = result.copy_to_host()
        else:
            result_np = np.cumsum(result.copy_to_host(), axis=1)

        result_temp = format_result(result_np, return_format)

        results.append(result_temp[result_temp.index.isin(list_of_indices)])

    if len(results) == 1:
        return results[0]
github fbpic / fbpic / fbpic / particles / tracking / tracking.py
    def send_to_gpu(self):
        """
        Transfer the tracking data from the CPU to the GPU
        """
        self.id = cuda.to_device( self.id )
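
The fbpic method above replaces a host attribute with its device copy in place. The return trip is the mirror operation via copy_to_host; the class below is a purely illustrative stand-in and is not taken from fbpic:

import numpy as np
from numba import cuda

class Tracker:
    """Minimal stand-in for an object that keeps an id array."""
    def __init__(self, n):
        self.id = np.arange(n, dtype=np.uint64)

    def send_to_gpu(self):
        # Replace the host array with its on-device copy
        self.id = cuda.to_device(self.id)

    def receive_from_gpu(self):
        # Replace the device array with a fresh host copy
        self.id = self.id.copy_to_host()
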
github xnd-project / arrayviews / arrayviews / cuda / numba_cuda_DeviceNDArray_as.py
def random(size):
    import numba.cuda as cuda
    import numpy as np
    arr = np.random.randint(low=0, high=255, size=size,
                            dtype=np.uint8)
    return cuda.to_device(arr)
github numba / numba / examples / cudajit / matmul.py
    C[y, x] = 0
    for i in range(n):
        C[y, x] += A[y, i] * B[i, x]


A = np.array(np.random.random((n, n)), dtype=np.float32)
B = np.array(np.random.random((n, n)), dtype=np.float32)
C = np.empty_like(A)

print("N = %d x %d" % (n, n))

s = time()
stream = cuda.stream()
with stream.auto_synchronize():
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)
    dC = cuda.to_device(C, stream)
    cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
    dC.to_host(stream)

e = time()
tcuda = e - s

# Host compute
Amat = np.matrix(A)
Bmat = np.matrix(B)

s = time()
Cans = Amat * Bmat
e = time()
tcpu = e - s
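
Two details of the matmul example above are easy to miss. Passing a stream to cuda.to_device queues the copy on that stream, and stream.auto_synchronize() waits for all queued work when the with-block exits, so the result is only guaranteed to be back on the host after the block. Also, dC.to_host(stream) reflects an older numba API; current releases spell the same operation copy_to_host. A minimal sketch of the stream pattern with the current spelling (the kernel launch is elided):

import numpy as np
from numba import cuda

n = 256
A = np.random.random((n, n)).astype(np.float32)
C = np.empty_like(A)

stream = cuda.stream()
with stream.auto_synchronize():
    # Copies queued on a stream are asynchronous with respect to the host
    dA = cuda.to_device(A, stream)
    dC = cuda.to_device(C, stream)
    # ... a kernel launched as kernel[grid, block, stream](dA, dC) would go here ...
    dC.copy_to_host(ary=C, stream=stream)
# Leaving the with-block synchronizes the stream, so the copy into C has completed
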
github numba / numba / examples / blackscholes / blackscholes_cuda.py
    time0 = time.time()
    for i in range(iterations):
        black_scholes_numba(callResultNumba, putResultNumba, stockPrice,
                            optionStrike, optionYears, RISKFREE, VOLATILITY)
    time1 = time.time()
    print("Numba Time: %f msec" %
          ((1000 * (time1 - time0)) / iterations))

    time0 = time.time()
    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N)/blockdim[0])), 1
    stream = cuda.stream()
    d_callResult = cuda.to_device(callResultCuda, stream)
    d_putResult = cuda.to_device(putResultCuda, stream)
    d_stockPrice = cuda.to_device(stockPrice, stream)
    d_optionStrike = cuda.to_device(optionStrike, stream)
    d_optionYears = cuda.to_device(optionYears, stream)
    time1 = time.time()
    for i in range(iterations):
        black_scholes_cuda[griddim, blockdim, stream](
            d_callResult, d_putResult, d_stockPrice, d_optionStrike,
            d_optionYears, RISKFREE, VOLATILITY)
        d_callResult.to_host(stream)
        d_putResult.to_host(stream)
        stream.synchronize()
    time2 = time.time()
    dt = (time1 - time0) * 10 + (time2 - time1)
    print("Numba / CUDA time: %f msec" % ((1000 * dt) / iterations))

    delta = np.abs(callResultNumpy - callResultCuda)
    L1norm = delta.sum() / np.abs(callResultNumpy).sum()
    print("L1 norm: %E" % L1norm)
github numba / numba / examples / laplace2d / laplace2d-numba-cuda.py
        Anew[j, 0] = 1.0

    print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

    timer = time.time()
    iter = 0

    blockdim = (32, 32)
    griddim = (NN//blockdim[0], NM//blockdim[1])

    error_grid = np.zeros_like(A)

    stream = cuda.stream()

    dA = cuda.to_device(A, stream)          # to device and don't come back
    dAnew = cuda.to_device(Anew, stream)    # to device and don't come back
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and iter < iter_max:
        assert error_grid.dtype == np.float64

        jacobi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

        derror_grid.to_host(stream)


        # error_grid is available on host
        stream.synchronize()

        error = np.abs(error_grid).max()

        # swap dA and dAnew
github rapidsai / cuxfilter / python_scripts / numbaHist.py
def numba_gpu_histogram(a, bins):
    # Move data to GPU so we can do two operations on it
    a_gpu = cuda.to_device(a)

    # a_gpu = a
    # index_gpu = cuda.to_device(np.array(index))
    ### Find min and max value in array
    dtype_min, dtype_max = dtype_min_max(a.dtype)
    # Put them in the array in reverse order so that they will be replaced by the first element in the array
    min_max_array_gpu = cuda.to_device(np.array([dtype_max, dtype_min], dtype=a.dtype))
    min_max[64, 64](a_gpu, min_max_array_gpu)
    a_min, a_max = min_max_array_gpu.copy_to_host()
    # print(a_min, a_max)
    # SPEEDTIP: Skip this step if you don't need to reproduce the NumPy histogram edge array
    bin_edges = get_bin_edges(bins, a_min, a_max) # Doing this on CPU for now

    # counter = cuda.to_device(np.array([0]))
    ### Bin the data into a histogram 
    # print(bins)
    histogram_out = cuda.to_device(np.zeros(shape=(bins,), dtype=np.int32))
    histogram[64, 64](a_gpu, a_min, a_max, histogram_out)
    # print(counter.copy_to_host())
    # sol = histogram_out.copy_to_host()
    # print(sol)
    # print(len(sol))
    return histogram_out.copy_to_host(), bin_edges
github TDAmeritrade / stumpy / stumpy / gpu_stump.py
        device_T_A = cuda.to_device(T_A)
        device_T_B = cuda.to_device(T_B)
        device_M_T = cuda.to_device(M_T)
        device_Σ_T = cuda.to_device(Σ_T)
        device_QT_odd = cuda.to_device(QT)
        device_QT_even = cuda.to_device(QT)
        device_QT_first = cuda.to_device(QT_first)
        device_μ_Q = cuda.to_device(μ_Q)
        device_σ_Q = cuda.to_device(σ_Q)

        profile = np.empty((k, 3))  # float64
        indices = np.empty((k, 3))  # int64

        profile[:] = np.inf
        indices[:, :] = -1
        device_profile = cuda.to_device(profile)
        device_indices = cuda.to_device(indices)

        _compute_and_update_PI_kernel[blocks_per_grid, threads_per_block](
            range_start - 1,
            device_T_A,
            device_T_B,
            m,
            device_QT_even,
            device_QT_odd,
            device_QT_first,
            device_M_T,
            device_Σ_T,
            device_μ_Q,
            device_σ_Q,
            k,
            ignore_trivial,