How to use the numba.cuda module in numba

To help you get started, we’ve selected a few numba examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github numba / numba / numba / numba_entry.py View on Github external
msg_generic_problem = "Error: CUDA device intialisation problem."
            msg = getattr(e, 'msg', None)
            if msg is not None:
                if msg_not_found in msg:
                    err_msg = msg_not_found + msg_end
                elif msg_disabled_by_user in msg:
                    err_msg = msg_disabled_by_user + msg_end
                else:
                    err_msg = msg_generic_problem + " Message:" + msg
            else:
                err_msg = msg_generic_problem + " " + str(e)
            # Best effort error report
            print("%s\nError class: %s" % (err_msg, str(type(e))))
        else:
            try:
                cu.detect()
                dv = ct.c_int(0)
                cudriver.cuDriverGetVersion(ct.byref(dv))
                print(fmt % ("CUDA driver version", dv.value))
                print("CUDA libraries:")
                cudadrv.libs.test(sys.platform, print_paths=False)
            except:
                print(
                    "Error: Probing CUDA failed (device and driver present, runtime problem?)\n")

        print("")
        print("__ROC Information__")
        roc_is_available = roc.is_available()
        print(fmt % ("ROC available", roc_is_available))

        toolchains = []
        try:
github numba / numba / examples / cuda_mpi.py View on Github external
@cuda.jit
def sqplus2(input_data, output_data):
    """Write ``v * v + 2`` into ``output_data`` for every ``v`` of ``input_data``.

    The loop walks every index of ``input_data``; no thread index is used,
    so the work is not divided between threads here.
    """
    count = len(input_data)
    for idx in range(count):
        value = input_data[idx]
        squared = value * value
        output_data[idx] = squared + 2
github rapidsai / cuxfilter / python_scripts / numbaHistinMem.py View on Github external
def histogram(x, x_range, histogram_out):
    """Accumulate a histogram of ``x`` into ``histogram_out``.

    Each element of ``x`` is mapped to a bin by ``compute_bin`` and the
    matching counter in ``histogram_out`` is incremented atomically.
    Elements whose bin falls outside ``[0, nbins)`` are ignored.

    :param x: array of values to bin (indexed along its first axis)
    :param x_range: pair ``(xmin, xmax)`` forwarded to ``compute_bin``
    :param histogram_out: output array; its first dimension is the number
        of bins, and it is updated in place
    """
    nbins = histogram_out.shape[0]
    xmin, xmax = x_range
    # Grid-stride loop: each thread starts at its global index and advances
    # by the total number of threads in the grid.
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(start, x.shape[0], stride):
        # note that calling a numba.jit function from CUDA automatically
        # compiles an equivalent CUDA device function!
        bin_number = compute_bin(x[i], nbins, xmin, xmax)
        # Atomic add because several threads may hit the same bin.
        if bin_number >= 0 and bin_number < nbins:
            cuda.atomic.add(histogram_out, bin_number, 1)
github numba / numba / numba / cuda / cudadecl.py View on Github external
# Template keyed on `cuda.shared.array`; everything else is inherited from
# MacroTemplate (defined outside this view).
class Cuda_shared_array(MacroTemplate):
    key = cuda.shared.array


# Template keyed on `cuda.local.array`; everything else is inherited from
# MacroTemplate (defined outside this view).
class Cuda_local_array(MacroTemplate):
    key = cuda.local.array


# Template keyed on `cuda.const.array_like`; everything else is inherited
# from MacroTemplate (defined outside this view).
class Cuda_const_arraylike(MacroTemplate):
    key = cuda.const.array_like


# Template keyed on `cuda.syncthreads`, registered through the `intrinsic`
# decorator (defined outside this view).
@intrinsic
class Cuda_syncthreads(ConcreteTemplate):
    key = cuda.syncthreads
    # Single accepted signature, built from `types.none` alone.
    # NOTE(review): presumably "no arguments, returns none" - confirm against
    # the `signature` helper's argument order.
    cases = [signature(types.none)]


# Template keyed on `cuda.syncthreads_count`, registered through the
# `intrinsic` decorator (defined outside this view).
@intrinsic
class Cuda_syncthreads_count(ConcreteTemplate):
    key = cuda.syncthreads_count
    # Single accepted signature, built from two `types.i4` entries.
    # NOTE(review): presumably "takes one i4, returns i4" - confirm against
    # the `signature` helper's argument order.
    cases = [signature(types.i4, types.i4)]


# Template keyed on `cuda.syncthreads_and`, registered through the
# `intrinsic` decorator (defined outside this view).
@intrinsic
class Cuda_syncthreads_and(ConcreteTemplate):
    key = cuda.syncthreads_and
    # Single accepted signature, built from two `types.i4` entries.
    # NOTE(review): presumably "takes one i4, returns i4" - confirm against
    # the `signature` helper's argument order.
    cases = [signature(types.i4, types.i4)]


@intrinsic
github numba / numba / examples / cuda_ipc.py View on Github external
def parent():
    """Share a device array with a spawned child process through an IPC handle.

    Copies ``np.arange(10)`` to the device, sends the array's IPC handle to
    a process running ``child`` over a queue, then prints the original host
    array, the device array copied back to the host, and whatever the queue
    yields afterwards.
    """
    host_src = np.arange(10)
    dev_arr = cuda.to_device(host_src)
    handle = dev_arr.get_ipc_handle()

    # Start the worker in a 'spawn' multiprocessing context
    ctx = mp.get_context('spawn')
    channel = ctx.Queue()
    worker = ctx.Process(target=child, args=[channel])
    worker.start()

    # Hand the handle over, wait at most one second for the worker to exit,
    # then read the reply from the same queue.
    # NOTE(review): the handle and the reply travel over the same queue; if
    # the worker has not consumed the handle before join(1) times out, this
    # get() could return the handle itself - confirm against child().
    channel.put(handle)
    worker.join(1)
    reply = channel.get()

    print('original array:', host_src)
    # device array is modified by child process
    print('device array:', dev_arr.copy_to_host())
    print('returned host array', reply)
github fbpic / fbpic / fbpic / fields / spectral_transform / cuda_methods.py View on Github external
@cuda.jit
def cuda_rt_to_pm( buffer_r, buffer_t, buffer_p, buffer_m ) :
    """
    Combine the arrays buffer_r and buffer_t to produce the
    arrays buffer_p and buffer_m, according to the rules of
    the Fourier-Hankel decomposition (see associated paper)
    """
    # Set up cuda grid
    iz, ir = cuda.grid(2)

    if (iz < buffer_r.shape[0]) and (ir < buffer_r.shape[1]) :
        # Use intermediate variables, as the arrays
        # buffer_r and buffer_t may actually point to the same
        # object as buffer_p and buffer_m, for economy of memory
        value_r = buffer_r[iz, ir]
        value_t = buffer_t[iz, ir]
        # Combine the values
github fbpic / fbpic / fbpic / particles / elementary_process / ionization / cuda_methods.py View on Github external
ux, uy, uz, Ex, Ey, Ez, Bx, By, Bz, w, w_times_level ):
    """
    For each ion macroparticle, decide whether it is going to
    be further ionized during this timestep, based on the ADK rate.

    Increment the elements in `ionization_level` accordingly, and update the
    `w_times_level` of the ions to take into account the change in level
    of the corresponding macroparticle.

    For the purpose of counting and creating the corresponding electrons,
    `is_ionized` (one element per macroparticle) is set to 1 at the position
    of the ionized ions, and `n_ionized` (one element per batch) counts
    the total number of ionized particles in the current batch.
    """
    # Loop over batches of particles
    i_batch = cuda.grid(1)
    if i_batch < N_batch:

        # Set the count of ionized particles in the batch to 0
        n_ionized[i_batch] = 0

        # Loop through the batch
        N_max = min( (i_batch+1)*batch_size, Ntot )
        for ip in range( i_batch*batch_size, N_max ):

            # Skip the ionization routine, if the maximal ionization level
            # has already been reached for this macroparticle
            level = ionization_level[ip]
            if level >= level_max:
                is_ionized[ip] = 0
                continue
github numba / numba / numba / cuda / kernels / transpose.py View on Github external
This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    """

    # prefer `a`'s stream if
    stream = getattr(a, 'stream', 0)

    if not b:
        cols, rows = a.shape
        strides = a.dtype.itemsize * cols, a.dtype.itemsize
        b = cuda.cudadrv.devicearray.DeviceNDArray(
            (rows, cols),
            strides,
            dtype=a.dtype,
            stream=stream)

    dt=nps.from_dtype(a.dtype)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    # we need to factor available threads into x and y axis
    tile_width = int(math.pow(2, math.log(tpb, 2)/2))
    tile_height = int(tpb / tile_width)

    tile_shape=(tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):
github rapidsai / cuml / python / cuml / dask / common / utils.py View on Github external
def select_device(dev, close=True):
    """
    Use numba to select the given CUDA device, optionally closing the
    current cuda context before the new device is selected.

    Nothing is done when ``dev`` is already the current device. After
    selecting, a warning is logged if the current device id still does not
    match ``dev``; no exception is raised by this function itself.

    :param dev: int device to select
    :param close: bool close the current cuda context before selecting?
    """
    if numba.cuda.get_current_device().id != dev:
        # logging.warn is a deprecated alias of logging.warning and is
        # removed in Python 3.13, so call logging.warning directly.
        logging.warning("Selecting device %s", dev)
        if close:
            numba.cuda.close()
        numba.cuda.select_device(dev)
        if dev != numba.cuda.get_current_device().id:
            logging.warning("Current device %s does not match expected %s",
                            numba.cuda.get_current_device(), dev)
github peng-cao / mripy / fft / nufft_func_cuda.py View on Github external
def build_grid_2d2_cuda( x, y, fntau, tau, nspread ):
    """Launch ``gaussker_2d2_cuda`` over the samples and return the scaled result.

    An output array shaped like ``x`` (with ``fntau``'s dtype) is zero-filled,
    passed to the kernel together with the grid spacings ``2*pi/nf1`` and
    ``2*pi/nf2``, and finally divided by ``nf1*nf2``.
    """
    nf1, nf2 = fntau.shape[0], fntau.shape[1]
    two_pi = 2 * np.pi
    hx = two_pi / nf1
    hy = two_pi / nf2

    out = np.zeros(x.shape, dtype=fntau.dtype)

    # One thread per sample; block size is the device's warp size and the
    # block count is the sample count divided by it, rounded up.
    n_samples = x.shape[0]
    threads_per_block = cuda.get_current_device().WARP_SIZE
    blocks_per_grid = int(np.ceil(float(n_samples) / threads_per_block))

    launch = gaussker_2d2_cuda[blocks_per_grid, threads_per_block]
    launch(x, y, out, hx, hy, nf1, nf2, nspread, tau, fntau)
    return out / (nf1 * nf2)