Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_elwise_kernel(ctx_factory):
    """Check a hand-written ElementwiseKernel against array arithmetic.

    A kernel computing ``z[i] = a*x[i] + b*y[i]`` is built and applied to two
    random 50-element float32 device arrays with ``a=5`` and ``b=6``.  The
    result must agree (norm of the difference below 1e-5) with the same
    expression evaluated through the device arrays' own operators.

    ``ctx_factory`` is called without arguments to obtain the context.
    """
    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # Two random input vectors on the device.
    x_dev = clrand(queue, (50,), np.float32)
    y_dev = clrand(queue, (50,), np.float32)

    lin_comb = ElementwiseKernel(
        context,
        "float a, float *x, float b, float *y, float *z",
        "z[i] = a*x[i] + b*y[i]",
        "linear_combination")

    # Run the custom kernel into a freshly allocated output array.
    z_dev = cl_array.empty_like(x_dev)
    lin_comb(5, x_dev, 6, y_dev, z_dev)

    # Reference value computed with array operators, compared on the host.
    expected_dev = 5 * x_dev + 6 * y_dev
    residual = (z_dev - expected_dev).get()
    assert la.norm(residual) < 1e-5
# Excerpt from a kernel transformation/timing routine.  The enclosing
# definition is not part of this excerpt, so `knl`, `n`, `dtype`, `order`,
# `ctx`, `queue`, `make_well_conditioned_dev_matrix` and `check_error` all
# come from outside.
# NOTE(review): indentation was lost in this excerpt; the nesting described
# in the comments below is read off the syntax -- confirm against the origin.
# Apply split_dimension to "k" with the value 16 and no_slabs=True.
knl = lp.split_dimension(knl, "k", 16, no_slabs=True)
#knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr")
# Add prefetches for 'a' and 'b' over the listed index names.
knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")])
knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),])
# Abort if the first entry returned by get_problems({}) is above 2
# (presumably a severity level -- TODO confirm).
assert knl.get_problems({})[0] <= 2
# Lazy generator: each schedule yielded by generate_loop_schedules (with the
# hints below) is passed through insert_register_prefetches when consumed.
kernel_gen = (lp.insert_register_prefetches(knl)
for knl in lp.generate_loop_schedules(knl,
hints=["k_outer", "k_inner_outer", "k_inner_inner"]
))
# Two device input matrices from the same helper with identical arguments.
a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
# Image objects built from the host copies of the inputs.
a_img = cl.image_from_array(ctx, a.get(), 1)
b_img = cl.image_from_array(ctx, b.get(), 1)
# Output array shaped like `a`, and the host-side reference product.
c = cl_array.empty_like(a)
refsol = np.dot(a.get(), b.get())
# launcher: invokes `kernel` on `queue` with the sizes returned by gsize()
# and lsize(), the two images and the output buffer; when `check` is true the
# host copy of `c` is passed to check_error together with `refsol`.
# Returns the event produced by the kernel call.
def launcher(kernel, gsize, lsize, check):
evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data,
g_times_l=True)
if check:
check_error(refsol, c.get())
return evt
# Hand the kernel variants and the launcher to the timing driver; 2*n**3 is
# presumably the operation count of an n-by-n matrix product -- TODO confirm.
lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3)
# Excerpt from a segmented-scan check.  It starts in the middle of a loop
# whose header is not visible; `i`, `seg_start`, `seg_boundaries`, `a`,
# `a_dev`, `result_host`, `knl`, `n` and `is_exclusive` come from outside.
# NOTE(review): indentation was lost, so where the per-segment loop body ends
# cannot be told from this excerpt -- confirm against the origin.
# The segment ends where the next boundary starts; the last segment runs to
# the end of the array (slice end None).
if i+1 < len(seg_boundaries):
seg_end = seg_boundaries[i+1]
else:
seg_end = None
# Exclusive variant: the segment's first entry is 0 and each later entry is
# the running sum of the elements before it within the segment.
if is_exclusive:
result_host[seg_start+1:seg_end] = np.cumsum(
a[seg_start:seg_end][:-1])
result_host[seg_start] = 0
# Inclusive variant: plain running sum over the segment.
else:
result_host[seg_start:seg_end] = np.cumsum(
a[seg_start:seg_end])
#print "REF", result_host
# Run the kernel under test into a fresh device array.
result_dev = cl_array.empty_like(a_dev)
knl(a_dev, seg_boundary_flags_dev, result_dev)
#print "RES", result_dev
# Exact element-wise comparison of device result and host reference.
is_correct = (result_dev.get() == result_host).all()
# On mismatch, print the difference, the positions where it is non-zero, and
# the problem size with its segment boundaries before the assertion fails.
if not is_correct:
diff = result_dev.get() - result_host
print("RES-REF", diff)
print("ERRWHERE", np.where(diff))
print(n, list(seg_boundaries))
assert is_correct
# Explicit garbage-collection pass after each case.
from gc import collect
collect()
print("%d excl:%s done" % (n, is_exclusive))
# Probe the largest usable work-group size by running an "addition" kernel
# (obtained via get_opencl_code) over `shape` elements with work-group sizes
# 1, 2, 4, ... up to `shape`.  The result is left in `max_valid_wg`.
shape = 4096
# Get the context.
ctx = pyopencl.Context()
assert ctx is not None
queue = pyopencl.CommandQueue(ctx)
max_valid_wg = 1
# Random host data, its device copy, and a device array of ones to add to it.
data = numpy.random.random(shape).astype(numpy.float32)
d_data = pyopencl.array.to_device(queue, data)
d_data_1 = pyopencl.array.zeros_like(d_data) + 1
program = pyopencl.Program(ctx, get_opencl_code("addition")).build()
maxi = int(round(numpy.log2(shape)))
# `maxi + 1` so that the largest candidate, wg == shape, is tried as well;
# with range(maxi) the probe stopped one power of two short.
for i in range(maxi + 1):
    d_res = pyopencl.array.empty_like(d_data)
    wg = 1 << i
    try:
        evt = program.addition(queue, (shape,), (wg,),
                               d_data.data, d_data_1.data, d_res.data,
                               numpy.int32(shape))
        evt.wait()
    except Exception as error:
        # A failing launch ends the probe: report it, drop the references to
        # the OpenCL objects and stop trying larger sizes.
        print("Error on WG=%s: %s" % (wg, error))
        program = queue = d_res = d_data_1 = d_data = None
        break
    else:
        # The launch succeeded; only accept this size if the result is right.
        res = d_res.get()
        good = numpy.allclose(res, data + 1)
        if good and wg > max_valid_wg:
            max_valid_wg = wg
# Determine `max_valid_wg` for the "addition" kernel: either ask the kernel
# for its work-group size on `device` (when `fast` is true) or probe it by
# launching with work-group sizes 1, 2, 4, ... up to 2**maxi.
# `ctx`, `shape`, `fast`, `device`, `logger` and `get_opencl_code` come from
# outside this excerpt.
assert ctx is not None
queue = pyopencl.CommandQueue(ctx)
max_valid_wg = 1

# Random host data and its device copy.
data = numpy.random.random(shape).astype(numpy.float32)
d_data = pyopencl.array.to_device(queue, data)

# Device array of ones with the same layout as the data.
d_data_1 = pyopencl.array.empty_like(d_data)
d_data_1.fill(numpy.float32(1.0))

program = pyopencl.Program(ctx, get_opencl_code("addition")).build()

if not fast:
    maxi = int(round(numpy.log2(shape)))
    for i in range(maxi + 1):
        d_res = pyopencl.array.empty_like(d_data)
        wg = 1 << i
        try:
            evt = program.addition(
                queue,
                (shape,),
                (wg,),
                d_data.data,
                d_data_1.data,
                d_res.data,
                numpy.int32(shape),
            )
            evt.wait()
        except Exception as error:
            # A failing launch ends the probe and drops the OpenCL objects.
            logger.info("%s on device %s for WG=%s/%s", error, device.name, wg, shape)
            program = queue = d_res = d_data_1 = d_data = None
            break

        # Reached only when the launch succeeded: keep the size if the
        # kernel output matches the host-side reference.
        res = d_res.get()
        good = numpy.allclose(res, data + 1)
        if good and wg > max_valid_wg:
            max_valid_wg = wg
else:
    max_valid_wg = program.addition.get_work_group_info(
        pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, device)
# Determine `max_valid_wg` for the "addition" kernel.  `ctx`, `shape`, `fast`,
# `device`, `logger` and `get_opencl_code` come from outside this excerpt.
# NOTE(review): indentation was lost and the excerpt is cut off after the
# last `if`; the nesting described below is read off the syntax -- confirm.
assert ctx is not None
queue = pyopencl.CommandQueue(ctx)
max_valid_wg = 1
# Random host data, its device copy, and a device array of ones.
data = numpy.random.random(shape).astype(numpy.float32)
d_data = pyopencl.array.to_device(queue, data)
d_data_1 = pyopencl.array.zeros_like(d_data) + 1
program = pyopencl.Program(ctx, get_opencl_code("addition")).build()
# Fast path: ask the kernel for its WORK_GROUP_SIZE on `device` instead of
# probing.
if fast:
max_valid_wg = program.addition.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, device)
# Slow path: launch with work-group sizes 1, 2, 4, ..., 2**maxi.
else:
maxi = int(round(numpy.log2(shape)))
for i in range(maxi + 1):
d_res = pyopencl.array.empty_like(d_data)
wg = 1 << i
try:
evt = program.addition(queue, (shape,), (wg,),
d_data.data, d_data_1.data, d_res.data,
numpy.int32(shape))
evt.wait()
# A failing launch is logged, the OpenCL objects are dropped and the probe
# stops.
except Exception as error:
logger.info("%s on device %s for WG=%s/%s", error, device.name, wg, shape)
program = queue = d_res = d_data_1 = d_data = None
break
# On success the output is compared with the host-side reference data + 1.
else:
res = d_res.get()
good = numpy.allclose(res, data + 1)
if good:
if wg > max_valid_wg:
# NOTE(review): the body of the `if` above is missing from this excerpt.
# Use OpenCL To Add Two Random Arrays (This Way Hides Details)
import pyopencl as cl # Import the OpenCL GPU computing API
import pyopencl.array as pycl_array # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object)
import numpy as np # Import Numpy number tools
# Set up an OpenCL context and a command queue on it.
context = cl.create_some_context()
queue = cl.CommandQueue(context)

# Two random float32 host vectors of 50000 elements, uploaded to the device.
host_a = np.random.rand(50000).astype(np.float32)
a = pycl_array.to_device(queue, host_a)
host_b = np.random.rand(50000).astype(np.float32)
b = pycl_array.to_device(queue, host_b)

# Destination array allocated like `a`; its contents are not initialised.
c = pycl_array.empty_like(a)

# OpenCL C source: each work-item adds one pair of elements.
kernel_source = """
__kernel void sum(__global const float *a, __global const float *b, __global float *c)
{
int i = get_global_id(0);
c[i] = a[i] + b[i];
}"""
program = cl.Program(context, kernel_source).build()

# Enqueue the kernel over the shape of `a` (local size passed as None),
# writing the element-wise sum into `c`.
program.sum(queue, a.shape, None, a.data, b.data, c.data)

# Print inputs and output so the sum can be checked by eye.
print("a: {}".format(a))
print("b: {}".format(b))
print("c: {}".format(c))
# Excerpt from a kernel transformation/timing routine.  The enclosing
# definition is not part of this excerpt, so `knl`, `ilp`, `j_inner_split`,
# `n`, `dtype`, `order`, `ctx`, `queue` and
# `make_well_conditioned_dev_matrix` come from outside.
# NOTE(review): indentation was lost in this excerpt; the nesting described
# in the comments below is read off the syntax -- confirm against the origin.
# Split "j" and then "j_inner" with the given tags, and split "k" by 2.
knl = lp.split_iname(knl, "j", ilp*j_inner_split, outer_tag="g.1")
knl = lp.split_iname(knl, "j_inner", j_inner_split, outer_tag="ilp", inner_tag="l.0")
knl = lp.split_iname(knl, "k", 2)
# Add prefetches for 'a' and 'b' over the listed index names.
knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"])
# Abort if the first entry returned by get_problems({}) is above 2
# (presumably a severity level -- TODO confirm).
assert knl.get_problems({})[0] <= 2
# Lazy generator: each generated schedule is passed through
# insert_register_prefetches when consumed.
kernel_gen = (lp.insert_register_prefetches(knl)
for knl in lp.generate_loop_schedules(knl))
# Device input matrices; `b` differs from `a` only by inc_factor=0.
a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order,
ran_factor=1, id_factor=5)
b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order,
ran_factor=1, id_factor=5, inc_factor=0)
# Output array shaped like `a`, and image objects built from host copies of
# the inputs.
c = cl_array.empty_like(a)
a_img = cl.image_from_array(ctx, a.get(), 1)
b_img = cl.image_from_array(ctx, b.get(), 1)
# launcher: invokes `kernel` on `queue` with the sizes returned by gsize()
# and lsize(), the two images and the output buffer, and returns the event.
# The `check` argument is accepted but not used here.
def launcher(kernel, gsize, lsize, check):
evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data,
g_times_l=True)
return evt
# Run the timing driver with build options obtained for the context's first
# device; 2*n**3 is presumably the operation count of an n-by-n matrix
# product -- TODO confirm.
from pyopencl.characterize import get_fast_inaccurate_build_options
lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3,
options=get_fast_inaccurate_build_options(ctx.devices[0]))
# Set-up for the CBF kernel experiments: load the reference image and the raw
# stream from `fname`, create a profiling-enabled queue on `ctx`, allocate the
# device buffers and build the program from "sandbox/cbf.cl".
# `fname`, `ctx`, `fabio` and `cbf` come from outside this excerpt.
data = fabio.open(fname).data
raw = cbf.read(fname, only_raw=True)
properties = pyopencl.command_queue_properties.PROFILING_ENABLE
# properties = None
queue = pyopencl.CommandQueue(ctx, properties=properties)
# View the raw stream as signed bytes.  numpy.frombuffer replaces the
# deprecated binary mode of numpy.fromstring; the resulting read-only view is
# enough here because the array is only uploaded to the device.
# NOTE(review): assumes `raw` is a bytes-like object -- confirm.
raw_n = numpy.frombuffer(raw, dtype="int8")
size = raw_n.size
raw_d = pyopencl.array.to_device(queue, raw_n)
# One int32 slot per raw byte, plus int32 buffers with one slot per pixel of
# the reference image.
int_d = pyopencl.array.empty(queue, (size,), dtype="int32")
data_d = pyopencl.array.empty(queue, (data.size,), dtype="int32")
tmp1_d = pyopencl.array.zeros_like(data_d)
tmp2_d = pyopencl.array.zeros_like(data_d)
tmp3_d = pyopencl.array.zeros_like(data_d)
lem_d = pyopencl.array.empty_like(data_d)
zero_d = pyopencl.array.zeros(queue, shape=1, dtype="int32")
# Read the kernel source through a context manager so the file handle is
# closed deterministically.
with open("sandbox/cbf.cl") as kernel_file:
    src = kernel_file.read()
prg = pyopencl.Program(ctx, src).build()
# Loop over work-group sizes WG = 1, 2, 4, ..., 1024 (1 << i for i in 0..10).
# NOTE(review): indentation was lost and the loop body appears to be cut off
# after the debug buffers -- confirm against the origin.
for i in range(11):
WG = 1 << i
# Banner identifying the current work-group size in the output.
print("#" * 80)
print("WG: %s" % WG)
# Three local-memory allocations of 4 * WG each (presumably bytes, i.e. one
# 4-byte value per work-item -- TODO confirm).
la = pyopencl.LocalMemory(4 * WG)
lb = pyopencl.LocalMemory(4 * WG)
lc = pyopencl.LocalMemory(4 * WG)
# ld = pyopencl.LocalMemory(4)
# Three zero-initialised int32 device arrays with WG entries each.
debug1_d = pyopencl.array.zeros(queue, shape=WG, dtype="int32")
debug2_d = pyopencl.array.zeros(queue, shape=WG, dtype="int32")
debug3_d = pyopencl.array.zeros(queue, shape=WG, dtype="int32")
# Excerpt from the interior of a larger routine; `debug`, `box_levels`,
# `level_start_box_nrs`, `level`, `box_srcntgt_starts`, `nsrcntgts`,
# `common_args`, `knl_info`, `wait_for` and `fin_debug` come from outside.
# NOTE(review): indentation was lost, so the extent of each `if debug:` body
# is read off the syntax -- confirm against the origin.
# Debug check: after box_levels.finish(), the entries of box_levels between
# the last two values of level_start_box_nrs must all equal `level`.
if debug:
box_levels.finish()
level_bl_chunk = box_levels.get()[
level_start_box_nrs[-2]:level_start_box_nrs[-1]]
assert (level_bl_chunk == level).all()
del level_bl_chunk
# Debug check: every entry of box_srcntgt_starts is below nsrcntgts.
if debug:
assert (box_srcntgt_starts.get() < nsrcntgts).all()
# }}}
# {{{ renumber particles within split boxes
# Fresh output arrays allocated like the current id arrays.
new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids)
new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids)
# Kernel arguments: the shared argument tuple followed by the per-box flags
# and the two output arrays.
particle_renumberer_args = (
common_args
+ (box_has_children, force_split_box,
new_user_srcntgt_ids, new_srcntgt_box_ids))
# Launch over slice(nsrcntgts), waiting on the events in `wait_for`.
evt = knl_info.particle_renumberer_kernel(*particle_renumberer_args,
range=slice(nsrcntgts), wait_for=wait_for)
# Later operations wait on this kernel's event.
wait_for = [evt]
fin_debug("particle renumbering")
# The renumbered arrays replace the previous ones.
user_srcntgt_ids = new_user_srcntgt_ids
del new_user_srcntgt_ids
srcntgt_box_ids = new_srcntgt_box_ids