How to use the cupy.RawKernel class in cupy

To help you get started, we’ve selected a few cupy.RawKernel examples, based on popular ways it is used in public projects.

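Before working through the project snippets below, here is a minimal, self-contained sketch of the cp.RawKernel workflow: compile a CUDA C source string, then launch the named __global__ function with a (grid, block, args) call. The vector_add kernel and its source string are our own illustration, not code taken from the projects listed on this page.

import cupy as cp
import numpy as np

# CUDA C source; RawKernel compiles it with NVRTC on the first launch.
add_source = r'''
extern "C" __global__
void vector_add(const float* x, const float* y, float* out, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        out[i] = x[i] + y[i];
    }
}
'''

# The second argument names the __global__ function to load.
vector_add = cp.RawKernel(add_source, 'vector_add')

n = 1 << 20
x = cp.arange(n, dtype=cp.float32)
y = cp.ones(n, dtype=cp.float32)
out = cp.empty_like(x)

threads = 256
blocks = (n + threads - 1) // threads

# Launch as kernel(grid, block, args); the scalar is wrapped as np.int32 so
# its width matches the `int n` parameter.
vector_add((blocks,), (threads,), (x, y, out, np.int32(n)))

assert cp.allclose(out, x + y)

Grid and block sizes are plain tuples, and every pointer argument is a CuPy ndarray already resident on the GPU. RawKernel does not convert argument types for you, so dtypes must match the kernel signature, as the examples below are careful to do.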

github rossant / pykilosort / pykilosort / learn.py
    cleanup_spikes = cp.RawKernel(code, 'cleanup_spikes')
    cleanup_spikes(
        (int(NT / Nthreads),), (Nthreads,), (d_Params, d_err, d_ftype, d_x, d_st, d_id, d_counter))

    # ignore peaks that are smaller than another nearby peak
    cleanup_heights = cp.RawKernel(code, 'cleanup_heights')
    cleanup_heights(
        (1 + int(maxFR // 32),), (32,), (d_Params, d_x, d_st, d_id, d_st1, d_id1, d_counter))

    # add new spikes to 2nd counter
    counter[0] = d_counter[1]
    counter[0] = min(maxFR, counter[0])

    d_WU = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')
    # d_WU1 = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')

    # update dWU here by adding back to subbed spikes
    extract_snips = cp.RawKernel(code, 'extract_snips')
    extract_snips((Nchan,), tpS, (d_Params, d_st1, d_id1, d_counter, d_data, d_WU))

    # QUESTION: why a copy here??
    # if counter[0] > 0:
    #     d_WU1[...] = d_WU[...]

    del (
        d_ftype, d_kkmax, d_err, d_st, d_id, d_st1, d_x, d_kk, d_id1, d_counter,
        d_Params, d_dfilt)
    return d_WU, d_dout

github rossant / pykilosort / pykilosort / learn.py
    d_wtw = cp.zeros((nt0, nt0, Nfilt), dtype=np.float64, order='F')
    d_dWUb = cp.zeros((nt0, Nchan, Nfilt), dtype=np.float64, order='F')

    tpS = (nt0, int(Nthreads // nt0))
    tpK = (Nrank, int(Nthreads // Nrank))

    blankdWU = cp.RawKernel(code, 'blankdWU')
    blankdWU((Nfilt,), tpS, (d_Params, d_dWU, d_iC, d_iW, d_dWUb))

    # compute dWU * dWU'
    getwtw = cp.RawKernel(code, 'getwtw')
    getwtw((Nfilt,), tpS, (d_Params, d_dWUb, d_wtw))

    # get W by power svd iterations
    getW = cp.RawKernel(code, 'getW')
    getW((Nfilt,), (nt0,), (d_Params, d_wtw, d_W))

    # compute U by W' * dWU
    getU = cp.RawKernel(code, 'getU')
    getU((Nfilt,), tpK, (d_Params, d_dWUb, d_W, d_U))

    # normalize U, get S, get mu, renormalize W
    reNormalize = cp.RawKernel(code, 'reNormalize')
    reNormalize((Nfilt,), (nt0,), (d_Params, d_A, d_B, d_W, d_U, d_mu))

    del d_wtw, d_Params, d_dWUb

    return d_W, d_U, d_mu

github rossant / pykilosort / pykilosort / learn.py
    Nfilt = int(Params[1])
    nt0 = int(Params[9])

    d_Params = cp.asarray(Params, dtype=np.float64, order='F')

    d_W1 = cp.asarray(W1, dtype=np.float32, order='F')
    d_W2 = cp.asarray(W2, dtype=np.float32, order='F')
    d_UtU = cp.asarray(UtU, dtype=np.float32, order='F')

    d_WtW = cp.zeros((Nfilt, Nfilt, 2 * nt0 - 1), dtype=np.float32, order='F')

    grid = (1 + int(Nfilt // nblock), 1 + int(Nfilt // nblock))
    block = (nblock, nblock)

    crossFilter = cp.RawKernel(code, 'crossFilter')
    crossFilter(grid, block, (d_Params, d_W1, d_W2, d_UtU, d_WtW))

    del d_Params, d_W1, d_W2, d_UtU

    return d_WtW

github rossant / pykilosort / pykilosort / learn.py
    # sum each template across channels, square, take max
    sumChannels = cp.RawKernel(code, 'sumChannels')
    sumChannels((int(NT / Nthreads),), (Nthreads,), (d_Params, d_dfilt, d_dout, d_kkmax, d_iC))

    # compute the best filter
    bestFilter = cp.RawKernel(code, 'bestFilter')
    bestFilter(
        (int(NT / Nthreads),), (Nthreads,), (d_Params, d_dout, d_err, d_ftype, d_kkmax, d_kk))

    # ignore peaks that are smaller than another nearby peak
    cleanup_spikes = cp.RawKernel(code, 'cleanup_spikes')
    cleanup_spikes(
        (int(NT / Nthreads),), (Nthreads,), (d_Params, d_err, d_ftype, d_x, d_st, d_id, d_counter))

    # ignore peaks that are smaller than another nearby peak
    cleanup_heights = cp.RawKernel(code, 'cleanup_heights')
    cleanup_heights(
        (1 + int(maxFR // 32),), (32,), (d_Params, d_x, d_st, d_id, d_st1, d_id1, d_counter))

    # add new spikes to 2nd counter
    counter[0] = d_counter[1]
    counter[0] = min(maxFR, counter[0])

    d_WU = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')
    # d_WU1 = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')

    # update dWU here by adding back to subbed spikes
    extract_snips = cp.RawKernel(code, 'extract_snips')
    extract_snips((Nchan,), tpS, (d_Params, d_st1, d_id1, d_counter, d_data, d_WU))

    # QUESTION: why a copy here??
    # if counter[0] > 0:

github rossant / pykilosort / pykilosort / cluster.py
    # Input GPU arrays.
    d_Params = cp.asarray(Params, dtype=np.float64, order='F')
    d_data = cp.asarray(dataRAW, dtype=np.float32, order='F')
    d_W = cp.asarray(wPCA, dtype=np.float32, order='F')
    d_iC = cp.asarray(iC, dtype=np.int32, order='F')

    # New GPU arrays.
    d_dout = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
    d_dmax = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
    d_st = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_id = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_counter = cp.zeros(1, dtype=np.int32, order='F')

    # filter the data with the temporal templates
    Conv1D = cp.RawKernel(code, 'Conv1D')
    Conv1D((Nchan,), (Nthreads,), (d_Params, d_data, d_W, d_dout))

    # get the max of the data
    max1D = cp.RawKernel(code, 'max1D')
    max1D((Nchan,), (Nthreads,), (d_Params, d_dout, d_dmax))

    # take max across nearby channels
    maxChannels = cp.RawKernel(code, 'maxChannels')
    maxChannels(
        (int(NT // Nthreads),), (Nthreads,),
        (d_Params, d_dout, d_dmax, d_iC, d_st, d_id, d_counter))

    # move d_x to the CPU
    minSize = 1
    minSize = min(maxFR, int(d_counter[0]))
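The pykilosort snippets repeatedly pass a one-element d_counter array that kernels increment on the device and the host then reads back with int(d_counter[0]). A minimal sketch of that pattern, assuming a made-up count_above kernel:

import cupy as cp
import numpy as np

# Illustrative kernel: count elements above a threshold with atomicAdd.
count_source = r'''
extern "C" __global__
void count_above(const float* x, int n, float thresh, int* counter) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n && x[i] > thresh) {
        atomicAdd(counter, 1);
    }
}
'''

count_above = cp.RawKernel(count_source, 'count_above')

n = 100000
x = cp.random.rand(n, dtype=cp.float32)
d_counter = cp.zeros(1, dtype=np.int32)

count_above(((n + 255) // 256,), (256,),
            (x, np.int32(n), np.float32(0.9), d_counter))

# Indexing the one-element array copies the count back to the host, just like
# int(d_counter[0]) in the snippet above.
n_above = int(d_counter[0])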

github AlphaAtlas / VapourSynth-Super-Resolution-Helper / Scripts / Alpha_CuPy.py
    # source code of CUDA kernel
    with open(os.path.join(os.path.dirname(Path(__file__).resolve()),'bilateral.cu'), 'r') as f:
        kernel_source_code = f.read()

    kernel_source_code = Template(kernel_source_code)
    kernel_source_code = kernel_source_code.substitute(
        width=w, height=h, sigma_s=-0.5/(sigmaS**2), sigma_r=-0.5/(sigmaR**2), 
        sigma=sigma, snn=snn, half_kernel_size=half_kernel_size)


    if fast:
        kernel = cp.RawKernel(kernel_source_code, 'bilateral', 
            options=('--use_fast_math', ))
    else:
        kernel = cp.RawKernel(kernel_source_code, 'bilateral')

    # create NumPy function
    def bilateral_core(h_img, kernel):
        # h_img must be a 2-D image

        d_img = cp.asarray(h_img)
        d_out = cp.empty_like(d_img)

        kernel(((w + blksize[0] - 1)//blksize[0], (h + blksize[1] - 1)//blksize[1]), blksize, (d_img, d_out))

        h_out = cp.asnumpy(d_out)

        return h_out

    # process
    return mufnp.numpy_process(src, bilateral_core, kernel=kernel)
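The bilateral-filter example above bakes the image size and filter constants into the CUDA source with string.Template before compiling, and forwards extra NVRTC flags through the options tuple. Here is a minimal sketch of the same pattern; the scale kernel and its ${n} placeholder are illustrative, not part of the project:

import cupy as cp
from string import Template

# Source with a compile-time ${n} placeholder instead of a runtime argument.
scale_template = Template(r'''
extern "C" __global__
void scale(const float* x, float* out) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < ${n}) {
        out[i] = 2.0f * x[i];
    }
}
''')

n = 4096
source = scale_template.substitute(n=n)

# Extra compiler flags go through `options`, as with --use_fast_math above.
scale = cp.RawKernel(source, 'scale', options=('--use_fast_math',))

x = cp.random.rand(n, dtype=cp.float32)
out = cp.empty_like(x)
scale(((n + 255) // 256,), (256,), (x, out))

Substituting constants at compile time lets the compiler treat them as literals, which is why the project templates the source rather than passing width, height, and the sigmas as kernel arguments.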

github rossant / pykilosort / pykilosort / cluster.py
    # get list of cmaxes for each combination of neuron and filter
    computeCost = cp.RawKernel(code, 'computeCost')
    computeCost(
        (Nfilters,), (1024,), (d_Params, d_uproj, d_mu, d_W, d_iMatch, d_iC, d_call, d_cmax))

    # loop through cmax to find best template
    bestFilter = cp.RawKernel(code, 'bestFilter')
    bestFilter((40,), (256,), (d_Params, d_iMatch, d_iC, d_call, d_cmax, d_id, d_x))

    # average all spikes for same template -- ORIGINAL
    average_snips = cp.RawKernel(code, 'average_snips')
    average_snips(
        (Nfilters,), (NrankPC, NchanNear), (d_Params, d_iC, d_call, d_id, d_uproj, d_cmax, d_dWU))

    count_spikes = cp.RawKernel(code, 'count_spikes')
    count_spikes((7,), (256,), (d_Params, d_id, d_nsp, d_x, d_V))

    del d_Params, d_V

    return d_dWU, d_id, d_x, d_nsp, d_cmax

github rossant / pykilosort / pykilosort / learn.py
        # update 1st counter from 2nd counter
        d_counter[1] = d_counter[0]

    # compute PC features from residuals + subtractions
    # TODO: design - let's not use numeric indexing into the Params array. It's much more difficult to read.
    if Params[12] > 0:
        computePCfeatures = cp.RawKernel(code, 'computePCfeatures')
        computePCfeatures(
            (Nfilt,), tpPC,
            (d_Params, d_counter, d_draw, d_st, d_id, d_y,
             d_W, d_U, d_mu, d_iW, d_iC, d_wPCA, d_featPC))

    # update dWU here by adding back to subbed spikes.
    # additional parameter d_idx = array of time sorted indices
    average_snips = cp.RawKernel(code, 'average_snips')
    average_snips(
        (Nfilt,), tpS,
        (d_Params, d_st, d_id, d_x, d_y, d_counter, d_draw, d_W, d_U, d_dWU, d_nsp, d_mu, d_z))

    if counter[0] < maxFR:
        minSize = counter[0]
    else:
        minSize = maxFR

    del d_counter, d_Params, d_ftype, d_err, d_eloss, d_z, d_dout, d_data

    return (
        d_st[:minSize], d_id[:minSize], d_y[:minSize], d_feat[..., :minSize],
        d_dWU, d_draw, d_nsp, d_featPC[..., :minSize], d_x[:minSize])

github chainer / chainercv / chainercv / utils / bbox / non_maximum_suppression.py
def _call_nms_kernel(bbox, thresh):
    n_bbox = bbox.shape[0]
    threads_per_block = 64
    col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)
    blocks = (col_blocks, col_blocks, 1)
    threads = (threads_per_block, 1, 1)

    mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)
    bbox = cp.ascontiguousarray(bbox, dtype=np.float32)
    kern = cp.RawKernel(_nms_gpu_code, 'nms_kernel')
    kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh),
                                bbox, mask_dev))

    mask_host = mask_dev.get()
    selection, n_selec = _nms_gpu_post(
        mask_host, n_bbox, threads_per_block, col_blocks)
    return selection, n_selec
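One detail worth noting in the chainercv call above: scalar arguments are wrapped in explicit dtype constructors (cp.int32, cp.float32) so their byte widths match the kernel signature, and the argument tuple is supplied through the args keyword. A small illustrative sketch, with a made-up threshold kernel:

import cupy as cp

thresh_source = r'''
extern "C" __global__
void threshold(const float* x, float thresh, unsigned char* mask, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        mask[i] = x[i] > thresh ? 1 : 0;
    }
}
'''

kern = cp.RawKernel(thresh_source, 'threshold')

n = 1024
x = cp.random.rand(n, dtype=cp.float32)
mask = cp.zeros(n, dtype=cp.uint8)

# Wrap scalars so `float thresh` receives 32 bits and `int n` receives int32.
kern(((n + 63) // 64,), (64,), args=(x, cp.float32(0.5), mask, cp.int32(n)))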

github rossant / pykilosort / pykilosort / learn.py
    d_dout = cp.zeros((NT, Nchan), dtype=np.float32, order='F')
    d_dfilt = cp.zeros((Nrank, NT, Nchan), dtype=np.float32, order='F')
    d_err = cp.zeros(NT, dtype=np.float32, order='F')
    d_kkmax = cp.zeros((NT, Nchan), dtype=np.int32, order='F')
    d_kk = cp.zeros(NT, dtype=np.int32, order='F')
    d_ftype = cp.zeros(NT, dtype=np.int32, order='F')
    d_st = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_id = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_x = cp.zeros(maxFR, dtype=np.float32, order='F')
    d_st1 = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_id1 = cp.zeros(maxFR, dtype=np.int32, order='F')

    counter = np.zeros(2, dtype=np.int32, order='F')

    # filter the data with the temporal templates
    Conv1D = cp.RawKernel(code, 'Conv1D')
    Conv1D((Nchan,), (Nthreads,), (d_Params, d_data, d_W, d_dfilt))

    # sum each template across channels, square, take max
    sumChannels = cp.RawKernel(code, 'sumChannels')
    sumChannels((int(NT / Nthreads),), (Nthreads,), (d_Params, d_dfilt, d_dout, d_kkmax, d_iC))

    # compute the best filter
    bestFilter = cp.RawKernel(code, 'bestFilter')
    bestFilter(
        (int(NT / Nthreads),), (Nthreads,), (d_Params, d_dout, d_err, d_ftype, d_kkmax, d_kk))

    # ignore peaks that are smaller than another nearby peak
    cleanup_spikes = cp.RawKernel(code, 'cleanup_spikes')
    cleanup_spikes(
        (int(NT / Nthreads),), (Nthreads,), (d_Params, d_err, d_ftype, d_x, d_st, d_id, d_counter))