Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Report the launch parameters and the device's local memory budget
print('WG Max Size: ' + str(wg_max_size))
print('Num groups: ' + str(num_groups))
print('Local mem size: ' + str(dev.local_mem_size))

# Print the preferred/native floatN lengths (which are optimal for the
# compiler/hardware, respectively).
# Vectorization can still yield higher throughput even if preferred/native is 1,
# due to better use of memory bandwidth.
print('Preferred floatN size: ' + str(dev.preferred_vector_width_float))
# The native width gets its own label so the two lines can be told apart
print('Native floatN size: ' + str(dev.native_vector_width_float))

# Data and device buffers
data = np.arange(start=0, stop=ARRAY_SIZE, dtype=np.float32)
result = np.zeros(shape=(1,), dtype=np.float32)
# Input is copied from the host array when the buffer is created
data_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
# Output buffer holds exactly one float32
sum_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=np.dtype(np.float32).itemsize)
# Local scratch space: wg_max_size * VECTOR_LENGTH float32 values
partial_sums = cl.LocalMemory(wg_max_size * np.dtype(np.float32).itemsize * VECTOR_LENGTH)

# Execute kernels
local_size = wg_max_size
global_size = ARRAY_SIZE // VECTOR_LENGTH
start_event = prog.reduction_vector(queue, (global_size,), (local_size,), data_buffer, partial_sums)
print('\nGlobal size: ' + str(global_size))

# There is some overhead involved with spawning a new kernel (code caching)
# A good rule of thumb is therefore to create the kernel object outside of loops
# Ref: https://lists.tiker.net/pipermail/pyopencl/2016-February/002107.html
kernel_reduction_vector = prog.reduction_vector

# Perform successive stages of reduction
while global_size // local_size > local_size:
def apply(self, *new_arrays):
    """
    Run the kernel on the arrays held in self.arrays and return the result.

    The arrays are normally the ones bound earlier via the bind method.
    Arrays passed here replace them, so the kernel does not have to be
    re-generated for repeated use, but they MUST match the type and shape
    of the initially bound arguments.
    """
    if new_arrays:
        self.arrays = new_arrays
    flags = cl.mem_flags
    host_result = np.zeros(shape=self.arrays[0].shape, dtype=self.return_typ)
    # one read-only device buffer per input, filled from its host array
    device_buffers = []
    for host_array in self.arrays:
        device_buffers.append(
            cl.Buffer(self.ctx, flags.READ_ONLY | flags.COPY_HOST_PTR, hostbuf=host_array))
    # the write-only result buffer is passed as the last kernel argument
    output_buffer = cl.Buffer(self.ctx, flags.WRITE_ONLY, host_result.nbytes)
    device_buffers.append(output_buffer)
    # the global size is the shape of the first array; local size is left to OpenCL
    self.prog.sum(self.queue, self.arrays[0].shape, None, *device_buffers)
    # read the result back into the host array
    cl.enqueue_copy(self.queue, host_result, output_buffer)
    return host_result.copy()
def gen_rgb_to_yuv():
    """
    Query the 2D image formats supported by the OpenCL context and set up
    the RGB to YUV kernel table.

    Returns an empty dict (after logging an error) when single channel
    (R) UNSIGNED_INT8 images are not supported as READ_ONLY with both
    COPY_HOST_PTR and USE_HOST_PTR.
    """
    global context
    from xpra.codecs.csc_opencl.opencl_kernels import gen_rgb_to_yuv_kernels, rgb_mode_to_indexes, indexes_to_rgb_mode
    #for RGB to YUV support we need to be able to handle the channel_order,
    #with READ_ONLY and both with COPY_HOST_PTR and USE_HOST_PTR since we
    #do not know in advance which one we can use..
    RGB_to_YUV_KERNELS = {}
    sif = pyopencl.get_supported_image_formats(context, mem_flags.WRITE_ONLY, pyopencl.mem_object_type.IMAGE2D)
    sif_copy = pyopencl.get_supported_image_formats(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, pyopencl.mem_object_type.IMAGE2D)
    #log the list that was queried with these flags (not the WRITE_ONLY one)
    debug("get_supported_image_formats(READ_ONLY | COPY_HOST_PTR, IMAGE2D)=%s", sif_copy)
    sif_use = pyopencl.get_supported_image_formats(context, mem_flags.READ_ONLY | mem_flags.USE_HOST_PTR, pyopencl.mem_object_type.IMAGE2D)
    debug("get_supported_image_formats(READ_ONLY | USE_HOST_PTR, IMAGE2D)=%s", sif_use)
    if (not has_image_format(sif_copy, pyopencl.channel_order.R, pyopencl.channel_type.UNSIGNED_INT8) or
            not has_image_format(sif_use, pyopencl.channel_order.R, pyopencl.channel_type.UNSIGNED_INT8)):
        log.error("cannot convert to YUV without support for READ_ONLY R channel with both COPY_HOST_PTR and USE_HOST_PTR")
        return {}
    missing = []
    found_rgb = set()
    def add_rgb_to_yuv(src_rgb_mode, kernel_rgb_mode, upload_rgb_mode, channel_order):
        """
        Generate the kernels for kernel_rgb_mode and record each one in
        RGB_to_YUV_KERNELS under the key (src_rgb_mode, destination mode).
        """
        debug("add_rgb_to_yuv%s", (src_rgb_mode, kernel_rgb_mode, upload_rgb_mode, CHANNEL_ORDER_TO_STR.get(channel_order)))
        kernels = gen_rgb_to_yuv_kernels(kernel_rgb_mode)
        for key, k_def in kernels.items():
            #each key is a (source rgb mode, destination mode) pair:
            key_rgb_mode, dst = key
            assert key_rgb_mode==kernel_rgb_mode
            #each value is a (kernel name, kernel source) pair:
            kname, ksrc = k_def
            RGB_to_YUV_KERNELS[(src_rgb_mode, dst)] = (kname, upload_rgb_mode, channel_order, ksrc)
# Work out the kernel work size from the destination dimensions and the
# largest x and y divisors found in the (x_div, y_div) pairs for the source format
# NOTE(review): dimdiv is defined elsewhere - presumably a rounding division, confirm
divs = get_subsampling_divs(self.src_format)
wwidth = dimdiv(self.dst_width, max(x_div for x_div, _ in divs))
wheight = dimdiv(self.dst_height, max(y_div for _, y_div in divs))
globalWorkSize, localWorkSize = self.get_work_sizes(wwidth, wheight)
# The kernel call arguments start with the queue and the two work sizes
kernelargs = [self.queue, globalWorkSize, localWorkSize]
# Each input plane is uploaded as a single channel (R) 8-bit unsigned image
iformat = pyopencl.ImageFormat(pyopencl.channel_order.R, pyopencl.channel_type.UNSIGNED_INT8)
input_images = []
# One input image per plane (3 planes)
for i in range(3):
_, y_div = divs[i]
plane = pixels[i]
# str planes are created with COPY_HOST_PTR, anything else with USE_HOST_PTR
if type(plane)==str:
flags = mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR
else:
flags = mem_flags.READ_ONLY | mem_flags.USE_HOST_PTR
# The image is as wide as the plane's stride, the height is divided by y_div
# NOTE(review): "/" yields a float under Python 3 - confirm the targeted Python version
shape = strides[i], self.src_height/y_div
iimage = pyopencl.Image(context, flags, iformat, shape=shape, hostbuf=plane)
input_images.append(iimage)
#output image: write-only, uses self.channel_order with UNORM_INT8,
#sized to the destination dimensions
oformat = pyopencl.ImageFormat(self.channel_order, pyopencl.channel_type.UNORM_INT8)
oimage = pyopencl.Image(context, mem_flags.WRITE_ONLY, oformat, shape=(self.dst_width, self.dst_height))
# Remaining kernel arguments: the input images, the source and destination
# dimensions as int32, the sampler and the output image
kernelargs += input_images + [numpy.int32(self.src_width), numpy.int32(self.src_height),
numpy.int32(self.dst_width), numpy.int32(self.dst_height),
self.sampler, oimage]
# "start" is set before these lines (not visible here); the debug message
# reports the time elapsed since then in milliseconds
kstart = time.time()
debug("convert_image(%s) calling %s%s after upload took %.1fms",
image, self.kernel_function_name, tuple(kernelargs), 1000.0*(kstart-start))
# Call the kernel function with the assembled arguments
self.kernel_function(*kernelargs)
hostbuf=self._current_chain_position)
# Each host-backed buffer is added to data_buffers and also recorded in
# readout_items as a [buffer, host array] pair
data_buffers.append(current_chain_position_buffer)
readout_items.append([current_chain_position_buffer, self._current_chain_position])
# Read/write buffer using the host memory of self._proposal_state
proposal_buffer = cl.Buffer(self._cl_context,
cl.mem_flags.READ_WRITE | cl.mem_flags.USE_HOST_PTR,
hostbuf=self._proposal_state)
data_buffers.append(proposal_buffer)
readout_items.append([proposal_buffer, self._proposal_state])
# One read/write buffer per entry of self._mh_state_dict, in sorted key order,
# each using the host memory of that entry's 'data' array
# NOTE(review): mcmc_state_buffers is filled but not used in the lines visible here
mcmc_state_buffers = {}
for mcmc_state_element in sorted(self._mh_state_dict):
host_array = self._mh_state_dict[mcmc_state_element]['data']
buffer = cl.Buffer(self._cl_context,
cl.mem_flags.READ_WRITE | cl.mem_flags.USE_HOST_PTR,
hostbuf=host_array)
mcmc_state_buffers[mcmc_state_element] = buffer
data_buffers.append(buffer)
readout_items.append([buffer, host_array])
# Local memory sized for workgroup_size doubles
data_buffers.append(cl.LocalMemory(workgroup_size * np.dtype('double').itemsize))
# Followed by the kernel inputs supplied by the data struct manager
data_buffers.extend(self._data_struct_manager.get_kernel_inputs(self._cl_context, workgroup_size))
return data_buffers, readout_items
def _create_buffers(self):
    """Create the device buffers for the parameters and the model data.

    Returns:
        tuple: the list of all buffers (parameters buffer first, then one
            read-only buffer per item of ``self._model.get_data()``) and
            the parameters buffer on its own.
    """
    flags = cl.mem_flags
    # read/write buffer using the host memory of self._parameters
    parameters_buffer = cl.Buffer(self._cl_run_context.context,
                                  flags.READ_WRITE | flags.USE_HOST_PTR,
                                  hostbuf=self._parameters)
    all_buffers = [parameters_buffer]
    # the model data is copied from the host when each buffer is created
    all_buffers.extend(
        cl.Buffer(self._cl_run_context.context,
                  flags.READ_ONLY | flags.COPY_HOST_PTR, hostbuf=model_data)
        for model_data in self._model.get_data())
    return all_buffers, parameters_buffer
# Use OpenCL To Add Two Random Arrays (This Way Shows Details)
import pyopencl as cl # Import the OpenCL GPU computing API
import numpy as np # Import Np number tools
# Select the first device [0] of the first platform [0], then create a
# context for that device and a command queue for that context
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
context = cl.Context([device])
queue = cl.CommandQueue(context)

# Host arrays: two random float32 inputs and an empty destination array
np_a, np_b = (np.random.rand(50000).astype(np.float32) for _ in range(2))
np_c = np.empty_like(np_a)

# Create three buffers (plans for areas of memory on the device):
# two initialised from the host inputs, one write-only for the result
cl_a, cl_b = (cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR, hostbuf=host_array)
              for host_array in (np_a, np_b))
cl_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, np_c.nbytes)

# Create a kernel (a string containing C-like OpenCL device code)
kernel = """__kernel void sum(__global float* a, __global float* b, __global float* c)
{
int i = get_global_id(0);
c[i] = a[i] + b[i];
}"""

# Compile the kernel code into an executable OpenCL program
program = cl.Program(context, kernel).build()

program.sum(queue, np_a.shape, None, cl_a, cl_b, cl_c)
# Enqueue the program for execution, causing data to be copied to the device
# - queue: the command queue the program will be sent to
def _create_buffers(self):
    """Create the device buffers for the parameters, objectives and data.

    Returns:
        tuple: the list of all buffers (parameters, objectives, then one
            buffer per ``self._data_info`` entry in sorted key order) and
            the objectives buffer on its own.
    """
    flags = cl.mem_flags
    # write-only buffer using the host memory of self._objectives
    objectives_buffer = cl.Buffer(self._cl_run_context.context,
                                  flags.WRITE_ONLY | flags.USE_HOST_PTR,
                                  hostbuf=self._objectives)
    # read-only buffer using the host memory of self._parameters
    parameters_buffer = cl.Buffer(self._cl_run_context.context,
                                  flags.READ_ONLY | flags.USE_HOST_PTR,
                                  hostbuf=self._parameters)
    all_buffers = [parameters_buffer, objectives_buffer]

    # the data entries are visited in sorted key order and copied from the host
    data_items = [self._data_info[key] for key in sorted(self._data_info)]
    for item in data_items:
        data_buffer = cl.Buffer(self._cl_run_context.context,
                                flags.READ_ONLY | flags.COPY_HOST_PTR,
                                hostbuf=item.get_data())
        all_buffers.append(data_buffer)
    return all_buffers, objectives_buffer
devices = [platform.get_devices()[device]]
self.ctx = cl.Context(devices, properties)
else:
self.ctx = cl.create_some_context()
# Command queue on the chosen context, created with profiling enabled
self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
self.device = self.queue.device
# Report the device name and its memory limits (values divided by 1024, shown as KiB)
print '#Device: {0}'.format(self.device.name)
print '#Memory size: {0} KiB'.format(self.device.global_mem_size / 1024)
print '#Maximum buffer size: {0} KiB'.format(self.device.max_mem_alloc_size / 1024)
# Read the whole of 'kernels.cl' (relative to the working directory) and build it
# NOTE(review): the file handle is never closed in the lines visible here -
# consider a "with" block
f = open('kernels.cl', 'r')
fstr = "".join(f.readlines())
self.prg = cl.Program(self.ctx, fstr).build()
# One read-only and one write-only device buffer, each of size max_mem_size
# NOTE(review): max_mem_size, local_threads and global_threads are defined
# outside the visible lines - presumably constructor arguments, confirm
self.in_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY, max_mem_size)
self.out_buf = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, max_mem_size)
self.local_threads = local_threads
self.global_threads = global_threads
self.max_mem_size = max_mem_size
def _create_buffers(self):
    """Create the device buffers for the parameters and the evaluations.

    Returns:
        tuple: the list of all buffers (parameters, evaluations, then the
            kernel inputs supplied by the data struct manager) and the
            evaluations buffer on its own.
    """
    flags = cl.mem_flags
    # write-only buffer using the host memory of self._evaluations
    evaluations_buffer = cl.Buffer(self._cl_context,
                                   flags.WRITE_ONLY | flags.USE_HOST_PTR,
                                   hostbuf=self._evaluations)
    # read-only buffer using the host memory of self._parameters
    parameters_buffer = cl.Buffer(self._cl_context,
                                  flags.READ_ONLY | flags.USE_HOST_PTR,
                                  hostbuf=self._parameters)
    all_buffers = [parameters_buffer, evaluations_buffer]
    all_buffers += self._data_struct_manager.get_kernel_inputs(self._cl_context, 1)
    return all_buffers, evaluations_buffer