import numpy as np
import pyopencl as cl
import pystella as ps


def test_spectral_poisson(ctx_factory, grid_shape, proc_shape, h, dtype,
                          timing=False):
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)
    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = (3, 5, 7)
    dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape))
    dk = tuple(2 * np.pi / Li for Li in L)

    if h == 0:
        def get_evals_2(k, dx):
            return - k**2

        derivs = ps.SpectralCollocator(fft, dk)
    else:
        from pystella.derivs import SecondCenteredDifference
        get_evals_2 = SecondCenteredDifference(h).get_eigenvalues
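
# Hedged aside (not part of the test above): for the standard three-point centered
# second-difference stencil, the Fourier eigenvalue is -4*sin(k*dx/2)**2 / dx**2,
# which approaches the spectral eigenvalue -k**2 as k*dx -> 0. Whether this matches
# SecondCenteredDifference(h).get_eigenvalues exactly is an assumption here.
import numpy as np

dx_demo = 0.1
k_demo = np.linspace(0.1, 1.0, 8)
spectral_evals = -k_demo**2
stencil_evals = -4 * np.sin(k_demo * dx_demo / 2)**2 / dx_demo**2
# relative error is O((k*dx)**2), so it is tiny for these well-resolved modes
print(np.max(np.abs((stencil_evals - spectral_evals) / spectral_evals)))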
import numpy as np


def make_gpu_integrator(p, eps, A):
    ps = [p] * 4
    q_unmapped = tensor_gauss(ps)

    with open('toy_kernel.cl', 'r') as f:
        code = f.read()

    import pyopencl as cl
    import pyopencl.array
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, code).build()

    # mf = cl.mem_flags
    # a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    # b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    gpu_qx = cl.array.to_device(queue, q_unmapped[0].flatten().astype(np.float32))
    gpu_qw = cl.array.to_device(queue, q_unmapped[1].astype(np.float32))

    def integrator(mins, maxs):
        print(mins.shape[0])
        block = (32, 1, 1)
        remaining = mins.shape[0] % block[0]
        grid_main = (mins.shape[0] // block[0], 1, 1)
        grid_rem = (remaining, 1, 1)
        out = np.empty(mins.shape[0], dtype=np.float32)
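
# Hedged aside (illustration only, not from the source above): the grid_main /
# grid_rem split covers every element exactly once, with full blocks first and
# the leftover elements in a second, smaller launch.
n = 100
block_size = 32
n_main = (n // block_size) * block_size   # elements covered by the full-block launch
n_rem = n % block_size                    # elements covered by the remainder launch
assert n_main + n_rem == n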
self.geometry_matrix_device = cl.Buffer(self.cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=geometry_matrix)
if copy_column_major:
    geometry_matric_col_maj = geometry_matrix.flatten(order='F')
    self.geometry_matric_col_maj_device = cl.Buffer(self.cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=geometry_matric_col_maj)
else:
    self.geometry_matric_col_maj_device = None

if laplacian_matrix is not None:
    laplacian_matrix = laplacian_matrix.flatten(order='F').astype(np.float32)
    self.laplacian_matrix_device = cl.Buffer(self.cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=laplacian_matrix)
else:
    self.laplacian_matrix_device = None

self.cell_ray_densities_device = cl.Buffer(self.cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cell_ray_densities)
self.ray_lengths_device = cl.Buffer(self.cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=ray_lengths)
grad_penalty = np.zeros(self.n_sources, dtype=np.float32)
self.grad_penalty_device = cl.Buffer(self.cl_context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=grad_penalty)
self.solution_device = cl.Buffer(self.cl_context, mf.READ_WRITE, cell_ray_densities.nbytes)
self.detectors_device = cl.Buffer(self.cl_context, mf.READ_ONLY, ray_lengths.nbytes)
self.y_hat_device = cl.Buffer(self.cl_context, mf.READ_WRITE, ray_lengths.nbytes)
# calculating global and local work sizes
nrem = self.n_sources % block_size
gws_sources_x = self.n_sources + bool(nrem) * (block_size - nrem)
mrem = self.m_detectors % block_size
gws_detectors_x = self.m_detectors + bool(mrem) * (block_size - mrem)
mrem_rm = self.m_detectors % block_size_row_maj
gws_detectors_row_maj_x = self.m_detectors + bool(mrem_rm) * (block_size_row_maj - mrem_rm)
if use_atomic:
    gws_sources_row_maj_y = self.n_sources // steps_per_thread_row_maj + bool(self.n_sources % steps_per_thread_row_maj)
    gws_sources_y = self.n_sources // steps_per_thread + bool(self.n_sources % steps_per_thread)
    gws_detectors_y = self.m_detectors // steps_per_thread + bool(self.m_detectors % steps_per_thread)
else:
    gws_sources_row_maj_y = gws_sources_y = gws_detectors_y = 1
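
# Hedged aside (illustration only): the round-up pattern used above,
# n + bool(n % b) * (b - n % b), is equivalent to ceil(n / b) * b.
for n in (1, 31, 32, 33, 100):
    b = 32
    rounded_up = n + bool(n % b) * (b - n % b)
    assert rounded_up == -(-n // b) * b      # ceil-division form
    assert rounded_up % b == 0 and rounded_up >= n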
if (max_elements % (cta_size * 4)) == 0:
    num_blocks = max_elements // (cta_size * 4)
else:
    num_blocks = max_elements // (cta_size * 4) + 1

self.d_temp_keys = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * max_elements)
self.d_temp_values = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * max_elements)
self.d_counters = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * self.WARP_SIZE * num_blocks)
self.d_counters_sum = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * self.WARP_SIZE * num_blocks)
self.d_block_offsets = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * self.WARP_SIZE * num_blocks)

numscan = max_elements // 2 // cta_size * 16
if numscan >= self.MIN_LARGE_ARRAY_SIZE:
    # MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE = 1024
    self.scan_buffer = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.dtype_size * numscan // 1024)
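
# Hedged alternative sketch (not part of the original): the same scratch buffers
# could be allocated with pyopencl.array, which tracks shape and dtype on the
# host side. The uint32 dtype and the sizes below are illustrative assumptions.
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

demo_ctx = cl.create_some_context()
demo_queue = cl.CommandQueue(demo_ctx)
demo_max_elements = 1 << 20
temp_keys = cl_array.empty(demo_queue, demo_max_elements, dtype=np.uint32)
temp_values = cl_array.empty(demo_queue, demo_max_elements, dtype=np.uint32)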
import pyopencl as cl
import pyopencl.array as cla
import pyopencl.clrandom as clr
import pystella as ps

# set parameters
grid_shape = (128, 128, 128)
proc_shape = (1, 1, 1)
rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
halo_shape = 1
dtype = 'float64'
dx = tuple(10 / Ni for Ni in grid_shape)
dt = min(dx) / 10

# create pyopencl context, queue, and halo-sharer
ctx = ps.choose_device_and_make_context()
queue = cl.CommandQueue(ctx)
decomp = ps.DomainDecomposition(proc_shape, halo_shape, rank_shape)

# initialize arrays with random data
f = clr.rand(queue, tuple(ni + 2 * halo_shape for ni in rank_shape), dtype)
dfdt = clr.rand(queue, tuple(ni + 2 * halo_shape for ni in rank_shape), dtype)
lap_f = cla.zeros(queue, rank_shape, dtype)

# define system of equations
f_ = ps.DynamicField('f', offset='h')  # don't overwrite f
rhs_dict = {
    f_: f_.dot,      # df/dt = \dot{f}
    f_.dot: f_.lap,  # d\dot{f}/dt = \nabla^2 f
}

# create time-stepping and derivative-computing kernels
stepper = ps.LowStorageRK54(rhs_dict, dt=dt, halo_shape=halo_shape)
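
# Hedged continuation sketch: the example above stops after building the stepper.
# Based on pystella's documented wave-equation example, the evolution loop might
# look like the following; the FiniteDifferencer name and the exact call
# signatures are assumptions, not verified against this source.
derivs = ps.FiniteDifferencer(decomp, halo_shape, dx)

t = 0.
while t < 10.:
    for s in range(stepper.num_stages):
        derivs(queue, fx=f, lap=lap_f)                         # compute \nabla^2 f into lap_f
        stepper(s, queue=queue, f=f, dfdt=dfdt, lap_f=lap_f)   # advance one RK stage
    t += dt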
print('WG Max Size: ' + str(wg_max_size))
print('Num groups: ' + str(num_groups))
print('Local mem size: ' + str(dev.local_mem_size))

# Print the preferred/native floatN lengths (optimal for the compiler/hardware, respectively).
# Vectorization can still yield higher throughput even if preferred/native is 1,
# due to better use of memory bandwidth.
print('Preferred floatN size: ' + str(dev.preferred_vector_width_float))
print('Native floatN size: ' + str(dev.native_vector_width_float))

# Data and device buffers
data = np.arange(start=0, stop=ARRAY_SIZE, dtype=np.float32)
result = np.zeros(shape=(1,), dtype=np.float32)
data_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
sum_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=np.dtype(np.float32).itemsize)
partial_sums = cl.LocalMemory(wg_max_size * np.dtype(np.float32).itemsize * VECTOR_LENGTH)

# Execute kernels
local_size = wg_max_size
global_size = ARRAY_SIZE // VECTOR_LENGTH
start_event = prog.reduction_vector(queue, (global_size,), (local_size,), data_buffer, partial_sums)
print('\nGlobal size: ' + str(global_size))

# There is some overhead involved with spawning a new kernel (code caching),
# so a good rule of thumb is to create the kernel object outside of loops.
# Ref: https://lists.tiker.net/pipermail/pyopencl/2016-February/002107.html
kernel_reduction_vector = prog.reduction_vector

# Perform successive stages of reduction
while global_size // local_size > local_size:
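    # Hypothetical loop body (not from the original source): each pass launches
    # the reduction kernel again, shrinking the problem by a factor of local_size
    # until a single work-group can finish it. Reusing data_buffer/partial_sums
    # and the argument order here are assumptions for illustration only.
    global_size //= local_size
    kernel_reduction_vector(queue, (global_size,), (local_size,),
                            data_buffer, partial_sums)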
# data_big = numpy.zeros((self.npt_height, self.npt_width), dtype=numpy.float32) + dummy
# data_big[:data.shape[0], :] = data
# self.cl_mem["input_data"].set(data_big)
# else:
if isinstance(data, pyopencl.array.Array):
    evt = pyopencl.enqueue_copy(data.queue, self.cl_mem["input_data"].data, data.data)
    events.append(EventDescription("copy", evt))
else:
    self.cl_mem["input_data"].set(data)

ws = self.npt_width // 8
if self.block_size < ws:
    raise RuntimeError("Requested a workgroup size of %s, maximum is %s" % (ws, self.block_size))

kargs = self.cl_kernel_args["bsort_horizontal"]
local_mem = kargs["l_data"]
if not local_mem or local_mem.size < ws * 32:
    kargs["l_data"] = pyopencl.LocalMemory(ws * 32)  # 2 float4 = 2*4*4 bytes per workgroup item

evt = self.kernels.bsort_horizontal(self.queue, (self.npt_height, ws), (1, ws), *kargs.values())
events.append(EventDescription("bsort_horizontal", evt))

if self.profile:
    with self.sem:
        self.events += events
return self.cl_mem["input_data"]
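
# Hedged aside (self-contained illustration): the block_size limit checked above
# typically comes from device or kernel work-group queries such as these.
import pyopencl as cl

demo_ctx = cl.create_some_context()
demo_device = demo_ctx.devices[0]
print(demo_device.max_work_group_size)
# For a built kernel object `knl`, the per-kernel limit would be:
# knl.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, demo_device)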
               numpy.uint32(integ.lut_size),
               # lut_idx_buf,
               # lut_coef_buf,
               lut_bufT,
               numpy.int32(0),
               numpy.float32(0),
               numpy.float32(0),
               outData_buf,
               outCount_buf,
               outMerge_buf)
t4 = time.time()
program.lut_integrate_lutT(q, (bins,), (16,), *args_bufT)
b = numpy.empty(bins, dtype=numpy.float32)
c = numpy.empty(bins, dtype=numpy.float32)
d = numpy.empty(bins, dtype=numpy.float32)
pyopencl.enqueue_copy(q, c, outData_buf)
pyopencl.enqueue_copy(q, d, outCount_buf)
pyopencl.enqueue_copy(q, b, outMerge_buf).wait()
t5 = time.time()
pylab.plot(a, b, label="OpenCL_imageT")
print("OpenCL speed-up: %s setup: %.2fms \texec: %.2fms"
      % (0.001 * ref_time / (t5 - t3), 1000 * (t4 - t3), 1000 * (t5 - t4)))
print(abs(ra - a).max(), abs(rb - b).max(), abs(rc - c).max(), abs(rd - d).max())

for j in list_size:
    st = time.time()
    program.lut_integrate_lutT(q, (bins,), (j,), *args_bufT)
    pyopencl.enqueue_copy(q, b, outMerge_buf).wait()
    print("Size: %s \ttime: %.2fms" % (j, 1000 * (time.time() - st)))

# plot(ee)
# pylab.plot(a, b, label="OpenCL")
pylab.legend()
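
# Hedged aside (self-contained illustration, unrelated to the buffers above):
# with a profiling-enabled queue, per-command timing can come from OpenCL event
# timestamps instead of time.time().
import numpy
import pyopencl

demo_ctx = pyopencl.create_some_context()
demo_q = pyopencl.CommandQueue(
    demo_ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
host = numpy.arange(1 << 20, dtype=numpy.float32)
dev_buf = pyopencl.Buffer(
    demo_ctx, pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.COPY_HOST_PTR,
    hostbuf=host)
out = numpy.empty_like(host)
evt = pyopencl.enqueue_copy(demo_q, out, dev_buf)
evt.wait()
print("copy took %.3f ms" % ((evt.profile.end - evt.profile.start) * 1e-6))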
try:
    knl(queue, a.shape, None, a_buf, 2, 3)
    assert False, "PyOpenCL should not accept bare Python types as arguments"
except cl.LogicError:
    pass

try:
    prg.mult(queue, a.shape, None, a_buf, float(2), 3)
    assert False, "PyOpenCL should not accept bare Python types as arguments"
except cl.LogicError:
    pass

prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3))

a_result = np.empty_like(a)
cl.enqueue_copy(queue, a_result, a_buf).wait()
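
# Hedged aside (not part of the original test): scalar argument types can also be
# declared once with Kernel.set_scalar_arg_dtypes (None marks buffer arguments),
# after which bare Python numbers are converted automatically.
mult_knl = prg.mult
mult_knl.set_scalar_arg_dtypes([None, np.float32, np.int32])
mult_knl(queue, a.shape, None, a_buf, 2, 3)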
if not USE_CPP_SIFT and keypoints_end - keypoints_start > 100:
    print("NOTE: Python implementation of descriptors is slow. Do not handle more than 100 keypoints, or grab a coffee...")

if USE_CPU:
    print("Using CPU-optimized kernels")
    wg = (1,)
    shape = (keypoints.shape[0] * wg[0],)
else:
    # wg = (8, 8, 8)
    # shape = int(keypoints.shape[0]*wg[0]), 8, 8
    wg = (8, 4, 4)
    shape = int(keypoints.shape[0] * wg[0]), 4, 4

gpu_keypoints = pyopencl.array.to_device(queue, keypoints)
# NOTE: for the following line, use pyopencl.array.empty instead of pyopencl.array.zeros if the keypoints are compacted
gpu_descriptors = pyopencl.array.zeros(queue, (keypoints_end - keypoints_start, 128), dtype=numpy.uint8, order="C")
gpu_grad = pyopencl.array.to_device(queue, grad)
gpu_ori = pyopencl.array.to_device(queue, ori)

keypoints_start, keypoints_end = numpy.int32(keypoints_start), numpy.int32(keypoints_end)
grad_height, grad_width = numpy.int32(grad.shape)
counter = pyopencl.array.to_device(queue, keypoints_end)

t0 = time.time()
k1 = self.program.descriptor(queue, shape, wg,
                             gpu_keypoints.data, gpu_descriptors.data, gpu_grad.data, gpu_ori.data,
                             numpy.int32(octsize), keypoints_start, counter.data, grad_width, grad_height)
res = gpu_descriptors.get()
t1 = time.time()

if USE_CPP_SIFT:
    import feature
    sc = feature.SiftAlignment()