How to use pyopencl - 10 common examples

To help you get started, we’ve selected a few pyopencl examples based on popular ways it is used in public projects.

github zachjweiner / pystella / test / test_poisson.py
import numpy as np
import pyopencl as cl
import pystella as ps


def test_spectral_poisson(ctx_factory, grid_shape, proc_shape, h, dtype,
                          timing=False):
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)
    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = (3, 5, 7)
    dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape))
    dk = tuple(2 * np.pi / Li for Li in L)

    if h == 0:
        def get_evals_2(k, dx):
            return - k**2

        derivs = ps.SpectralCollocator(fft, dk)
    else:
        from pystella.derivs import SecondCenteredDifference
        get_evals_2 = SecondCenteredDifference(h).get_eigenvalues
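The ctx_factory parameter above is supplied by pyopencl's pytest integration, with a manual fallback for running the file as a plain script. A minimal sketch of the same pattern (the function name run is illustrative, not from pystella):

import pyopencl as cl


def run(ctx_factory=None):
    # use the factory when the test harness provides one,
    # otherwise create a context interactively
    ctx = ctx_factory() if ctx_factory else cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    return ctx, queue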
github tbenthompson / tectosaur / tables / test_ndadapt.py
import numpy as np


def make_gpu_integrator(p, eps, A):
    ps = [p] * 4
    q_unmapped = tensor_gauss(ps)  # tensor_gauss is provided by the surrounding tectosaur module

    code = open('toy_kernel.cl', 'r').read()
    import pyopencl as cl
    import pyopencl.array
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, code).build()

    # mf = cl.mem_flags
    # a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    # b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)

    gpu_qx = cl.array.to_device(queue, q_unmapped[0].flatten().astype(np.float32))
    gpu_qw = cl.array.to_device(queue, q_unmapped[1].astype(np.float32))

    def integrator(mins, maxs):
        print(mins.shape[0])
        block = (32, 1, 1)
        remaining = mins.shape[0] % block[0]
        grid_main = (mins.shape[0] // block[0], 1, 1)
        grid_rem = (remaining, 1, 1)

        out = np.empty(mins.shape[0]).astype(np.float32)
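For reference, the build-and-launch plumbing this excerpt relies on fits in a few lines. A minimal, self-contained sketch (the scale kernel is an illustrative stand-in for toy_kernel.cl):

import numpy as np
import pyopencl as cl
import pyopencl.array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
src = """
__kernel void scale(__global float *x, const float a)
{
    int i = get_global_id(0);
    x[i] *= a;
}
"""
prg = cl.Program(ctx, src).build()
x = cl.array.to_device(queue, np.arange(16, dtype=np.float32))
prg.scale(queue, x.shape, None, x.data, np.float32(2.0))
print(x.get())  # every element doubled on the device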
github cherab / core / cherab / tools / inversions / opencl / sart_opencl.py
        self.geometry_matrix_device = cl.Buffer(self.cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=geometry_matrix)
        if copy_column_major:
            geometry_matrix_col_maj = geometry_matrix.flatten(order='F')
            self.geometry_matrix_col_maj_device = cl.Buffer(self.cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=geometry_matrix_col_maj)
        else:
            self.geometry_matrix_col_maj_device = None
        if laplacian_matrix is not None:
            laplacian_matrix = laplacian_matrix.flatten(order='F').astype(np.float32)
            self.laplacian_matrix_device = cl.Buffer(self.cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=laplacian_matrix)
        else:
            self.laplacian_matrix_device = None
        self.cell_ray_densities_device = cl.Buffer(self.cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cell_ray_densities)
        self.ray_lengths_device = cl.Buffer(self.cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=ray_lengths)
        grad_penalty = np.zeros(self.n_sources, dtype=np.float32)
        self.grad_penalty_device = cl.Buffer(self.cl_context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=grad_penalty)
        self.solution_device = cl.Buffer(self.cl_context, mf.READ_WRITE, cell_ray_densities.nbytes)
        self.detectors_device = cl.Buffer(self.cl_context, mf.READ_ONLY, ray_lengths.nbytes)
        self.y_hat_device = cl.Buffer(self.cl_context, mf.READ_WRITE, ray_lengths.nbytes)

        # calculating global and local work sizes
        nrem = self.n_sources % block_size
        gws_sources_x = self.n_sources + bool(nrem) * (block_size - nrem)
        mrem = self.m_detectors % block_size
        gws_detectors_x = self.m_detectors + bool(mrem) * (block_size - mrem)
        mrem_rm = self.m_detectors % block_size_row_maj
        gws_detectors_row_maj_x = self.m_detectors + bool(mrem_rm) * (block_size_row_maj - mrem_rm)
        if use_atomic:
            gws_sources_row_maj_y = self.n_sources // steps_per_thread_row_maj + bool(self.n_sources % steps_per_thread_row_maj)
            gws_sources_y = self.n_sources // steps_per_thread + bool(self.n_sources % steps_per_thread)
            gws_detectors_y = self.m_detectors // steps_per_thread + bool(self.m_detectors % steps_per_thread)
        else:
            gws_sources_row_maj_y = gws_sources_y = gws_detectors_y = 1
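The work-size arithmetic above rounds each global size up to the next multiple of the workgroup size, since OpenCL requires the global work size to be divisible by the local work size. The same idiom as a standalone helper (round_up is a hypothetical name):

def round_up(n, block_size):
    # pad n to the next multiple of block_size
    rem = n % block_size
    return n + bool(rem) * (block_size - rem)


assert round_up(100, 32) == 128
assert round_up(128, 32) == 128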
github benma / pysph / src / sph / radix_sort / radix_sort.py
        if (max_elements % (cta_size * 4)) == 0:
            num_blocks = max_elements // (cta_size * 4)
        else:
            num_blocks = max_elements // (cta_size * 4) + 1

        self.d_temp_keys = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * max_elements)
        self.d_temp_values = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * max_elements)

        self.d_counters = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * self.WARP_SIZE * num_blocks)
        self.d_counters_sum = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * self.WARP_SIZE * num_blocks)
        self.d_block_offsets = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * self.WARP_SIZE * num_blocks)

        numscan = max_elements//2//cta_size*16
        if numscan >= self.MIN_LARGE_ARRAY_SIZE:
            #MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE 1024
            self.scan_buffer = cl.Buffer(self.ctx, mf.READ_WRITE, size = self.dtype_size * numscan // 1024)
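The if/else computing num_blocks above is just ceiling division; the same result without branching (ceil_div is a hypothetical helper):

def ceil_div(n, d):
    # smallest number of d-sized blocks that covers n elements
    return (n + d - 1) // d


assert ceil_div(1024, 512) == 2
assert ceil_div(1025, 512) == 3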
github zachjweiner / pystella / examples / wave_equation.py
import pyopencl as cl
import pyopencl.array as cla
import pyopencl.clrandom as clr
import pystella as ps

# set parameters
grid_shape = (128, 128, 128)
proc_shape = (1, 1, 1)
rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
halo_shape = 1
dtype = 'float64'
dx = tuple(10 / Ni for Ni in grid_shape)
dt = min(dx) / 10

# create pyopencl context, queue, and halo-sharer
ctx = ps.choose_device_and_make_context()
queue = cl.CommandQueue(ctx)
decomp = ps.DomainDecomposition(proc_shape, halo_shape, rank_shape)

# initialize arrays with random data
f = clr.rand(queue, tuple(ni + 2 * halo_shape for ni in rank_shape), dtype)
dfdt = clr.rand(queue, tuple(ni + 2 * halo_shape for ni in rank_shape), dtype)
lap_f = cla.zeros(queue, rank_shape, dtype)

# define system of equations
f_ = ps.DynamicField('f', offset='h')  # don't overwrite f
rhs_dict = {
    f_: f_.dot,  # df/dt = \dot{f}
    f_.dot: f_.lap  # d\dot{f}/dt = \nabla^2 f
}

# create time-stepping and derivative-computing kernels
stepper = ps.LowStorageRK54(rhs_dict, dt=dt, halo_shape=halo_shape)
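A minimal, self-contained sketch of the device-array initialization used above (the shapes here are arbitrary):

import numpy as np
import pyopencl as cl
import pyopencl.array as cla
import pyopencl.clrandom as clr

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
f = clr.rand(queue, (8, 8, 8), np.float64)       # uniform samples, filled on the device
lap_f = cla.zeros(queue, (8, 8, 8), np.float64)  # zero-initialized device array
print(f.get().mean())                            # .get() copies back to the host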
github oysstu / pyopencl-in-action / ch10 / reduction_complete.py
print('WG Max Size: ' + str(wg_max_size))
print('Num groups: ' + str(num_groups))
print('Local mem size: ' + str(dev.local_mem_size))

# Print the preferred/native floatN lengths (which is optimal for the compiler/hardware, respectively)
# Vectorization can still yield higher throughput even if preferred/native is 1, due to better use of memory bandwidth
print('Preferred floatN size: ' + str(dev.preferred_vector_width_float))
print('Native floatN size: ' + str(dev.native_vector_width_float))

# Data and device buffers
data = np.arange(start=0, stop=ARRAY_SIZE, dtype=np.float32)
result = np.zeros(shape=(1,), dtype=np.float32)

data_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
sum_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=np.dtype(np.float32).itemsize)

partial_sums = cl.LocalMemory(wg_max_size * np.dtype(np.float32).itemsize * VECTOR_LENGTH)

# Execute kernels
local_size = wg_max_size
global_size = ARRAY_SIZE // VECTOR_LENGTH
start_event = prog.reduction_vector(queue, (global_size,), (local_size,), data_buffer, partial_sums)
print('\nGlobal size: ' + str(global_size))

# There is some overhead involved with spawning a new kernel (code caching)
# A good rule of thumb is therefore to create the kernel object outside of loops
# Ref: https://lists.tiker.net/pipermail/pyopencl/2016-February/002107.html
kernel_reduction_vector = prog.reduction_vector

# Perform successive stages of reduction
while global_size // local_size > local_size:
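The properties printed above are plain attributes of pyopencl's Device objects; a minimal sketch of querying them:

import pyopencl as cl

ctx = cl.create_some_context()
dev = ctx.devices[0]
print(dev.max_work_group_size)           # upper bound on the local size
print(dev.local_mem_size)                # bytes of __local memory per workgroup
print(dev.preferred_vector_width_float)  # the compiler's preferred floatN width
print(dev.native_vector_width_float)     # the hardware's native floatN width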
github silx-kit / pyFAI / pyFAI / opencl / sort.py
#                 data_big = numpy.zeros((self.npt_height, self.npt_width), dtype=numpy.float32) + dummy
#                 data_big[:data.shape[0], :] = data
#                 self.cl_mem["input_data"].set(data_big)
#         else:
        if isinstance(data, pyopencl.array.Array):
            evt = pyopencl.enqueue_copy(data.queue, self.cl_mem["input_data"].data, data.data)
            events.append(EventDescription("copy", evt))
        else:
            self.cl_mem["input_data"].set(data)
        ws = self.npt_width // 8
        if self.block_size < ws:
            raise RuntimeError("Requested a workgoup size of %s, maximum is %s" % (ws, self.block_size))
        kargs = self.cl_kernel_args["bsort_horizontal"]
        local_mem = kargs["l_data"]
        if not local_mem or local_mem.size < ws * 32:
            kargs["l_data"] = pyopencl.LocalMemory(ws * 32)  # 2float4 = 2*4*4 bytes per workgroup size
        evt = self.kernels.bsort_horizontal(self.queue, (self.npt_height, ws), (1, ws), *kargs.values())
        events.append(EventDescription("bsort_horizontal", evt))

        if self.profile:
            with self.sem:
                self.events += events
        return self.cl_mem["input_data"]
github silx-kit / pyFAI / src / test_splitBBox.py
                       numpy.uint32(integ.lut_size),
                       #lut_idx_buf,
                       #lut_coef_buf,
                       lut_bufT,
                       numpy.int32(0),
                       numpy.float32(0),
                       numpy.float32(0),
                       outData_buf,
                       outCount_buf,
                       outMerge_buf)
t4 = time.time()
program.lut_integrate_lutT(q, (bins,), (16,), *args_bufT)
b = numpy.empty(bins, dtype=numpy.float32)
c = numpy.empty(bins, dtype=numpy.float32)
d = numpy.empty(bins, dtype=numpy.float32)
pyopencl.enqueue_copy(q, c, outData_buf)
pyopencl.enqueue_copy(q, d, outCount_buf)
pyopencl.enqueue_copy(q, b, outMerge_buf).wait()
t5 = time.time()
pylab.plot(a, b, label="OpenCL_imageT")
print "OpenCL speed-up: %s setup: %.2fms \texec: %.2fms" % (0.001 * ref_time / (t5 - t3), 1000 * (t4 - t3), 1000 * (t5 - t4))
print abs(ra - a).max(), abs(rb - b).max(), abs(rc - c).max(), abs(rd - d).max()
for j in list_size:
    st = time.time()
    program.lut_integrate_lutT(q, (bins,), (j,), * args_bufT)
    pyopencl.enqueue_copy(q, b, outMerge_buf).wait()
    print("Size: %s \ttime: %.2fms" % (j, 1000 * (time.time() - st)))


#plot(ee)
#pylab.plot(a, b, label="OpenCL")
pylab.legend()
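One caveat on the time.time() bracketing above: kernel launches are asynchronous, so a wall-clock timer only measures real execution time once the queue has been drained. A sketch of the safer pattern, reusing the names from the excerpt:

import time

t0 = time.time()
program.lut_integrate_lutT(q, (bins,), (16,), *args_bufT)
q.finish()  # block until all enqueued work has completed
print("exec: %.2fms" % (1000 * (time.time() - t0)))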
github inducer / pyopencl / test / test_wrapper.py
    try:
        knl(queue, a.shape, None, a_buf, 2, 3)
        assert False, "PyOpenCL should not accept bare Python types as arguments"
    except cl.LogicError:
        pass

    try:
        prg.mult(queue, a.shape, None, a_buf, float(2), 3)
        assert False, "PyOpenCL should not accept bare Python types as arguments"
    except cl.LogicError:
        pass

    prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3))

    a_result = np.empty_like(a)
    cl.enqueue_copy(queue, a_result, a_buf).wait()
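If you would rather pass bare Python scalars, declare the kernel's scalar argument types once and pyopencl performs the conversion for you:

# continuing from the excerpt above
knl = prg.mult
knl.set_scalar_arg_dtypes([None, np.float32, np.int32])  # None marks buffer arguments
knl(queue, a.shape, None, a_buf, 2, 3)  # plain 2 and 3 are now accepted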
github pierrepaleo / sift_pyocl / test / test_keypoints_old.py
        if not USE_CPP_SIFT and (100 < keypoints_end - keypoints_start):
            print("NOTE: Python implementation of descriptors is slow. Do not handle more than 100 keypoints, or grab a coffee...")
        
        if (USE_CPU):
            print "Using CPU-optimized kernels"
            wg = 1,
            shape = keypoints.shape[0]*wg[0],
        else:
#            wg = (8, 8, 8)
#            shape = int(keypoints.shape[0]*wg[0]), 8, 8
            wg = (8, 4, 4)
            shape = int(keypoints.shape[0]*wg[0]), 4, 4
                        
        gpu_keypoints = pyopencl.array.to_device(queue, keypoints)
        #NOTE: for the following line, use pyopencl.array.empty instead of pyopencl.array.zeros if the keypoints are compacted
        gpu_descriptors = pyopencl.array.zeros(queue, (keypoints_end - keypoints_start, 128), dtype=numpy.uint8, order="C")
        gpu_grad = pyopencl.array.to_device(queue, grad)
        gpu_ori = pyopencl.array.to_device(queue, ori)

        keypoints_start, keypoints_end = numpy.int32(keypoints_start), numpy.int32(keypoints_end)
        grad_height, grad_width = numpy.int32(grad.shape)
        counter = pyopencl.array.to_device(queue, keypoints_end)
        
        t0 = time.time()
        k1 = self.program.descriptor(queue, shape, wg,
            gpu_keypoints.data, gpu_descriptors.data, gpu_grad.data, gpu_ori.data, numpy.int32(octsize),
            keypoints_start, counter.data, grad_width, grad_height)
        res = gpu_descriptors.get()
        t1 = time.time()

        if (USE_CPP_SIFT):
            import feature
            sc = feature.SiftAlignment()
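On the zeros-versus-empty note in the comment above: pyopencl.array.empty skips the fill kernel that zeros enqueues, so it is cheaper whenever every element will be overwritten anyway. A minimal sketch:

import numpy
import pyopencl
import pyopencl.array

ctx = pyopencl.create_some_context()
queue = pyopencl.CommandQueue(ctx)
# zeros enqueues a device-side fill; empty leaves the memory uninitialized
descriptors = pyopencl.array.zeros(queue, (100, 128), dtype=numpy.uint8)
scratch = pyopencl.array.empty(queue, (100, 128), dtype=numpy.uint8)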