Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
@memoize
def _splay_backend(n, dev):
# heavily modified from cublas
from pycuda.tools import DeviceData
devdata = DeviceData(dev)
min_threads = devdata.warp_size
max_threads = 128
max_blocks = 4 * devdata.thread_blocks_per_mp \
* dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)
if n < min_threads:
block_count = 1
threads_per_block = min_threads
elif n < (max_blocks * min_threads):
block_count = (n + min_threads - 1) // min_threads
threads_per_block = min_threads
@memoize
def _get_transpose_kernel():
mod = SourceModule("""
#define BLOCK_SIZE %(block_size)d
#define A_BLOCK_STRIDE (BLOCK_SIZE*a_width)
#define A_T_BLOCK_STRIDE (BLOCK_SIZE*a_height)
__global__ void transpose(float *A_t, float *A, int a_width, int a_height)
{
// Base indices in A and A_t
int base_idx_a = blockIdx.x*BLOCK_SIZE + blockIdx.y*A_BLOCK_STRIDE;
int base_idx_a_t = blockIdx.y*BLOCK_SIZE + blockIdx.x*A_T_BLOCK_STRIDE;
// Global indices in A and A_t
int glob_idx_a = base_idx_a + threadIdx.x + a_width*threadIdx.y;
int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height*threadIdx.y;
@memoize
def is_bounded(set):
    """Report whether *set* is bounded once its parameters become set dims.

    *set* must have no dimensions of type ``dim_type.set``. All of its
    ``dim_type.param`` dimensions are moved to ``dim_type.set`` (inserted at
    position 0), and the ``is_bounded()`` answer of the resulting object is
    returned.

    NOTE(review): ``set`` and ``dim_type`` look like islpy objects -- confirm
    against the file's imports. The parameter name shadows the ``set``
    builtin; it is kept so keyword callers continue to work.
    """
    # Only objects without any set dimensions are accepted.
    assert set.dim(dim_type.set) == 0

    param_count = set.dim(dim_type.param)
    params_as_set_dims = set.move_dims(
        dim_type.set, 0, dim_type.param, 0, param_count)
    return params_as_set_dims.is_bounded()
@memoize
def _get_unique_cids_kernel(ctx):
return GenericScanKernel(
ctx, np.int32, neutral="0",
arguments=r"""int *cids, int *unique_cids_map,
int *unique_cids, int *unique_cids_count""",
input_expr="(i == 0 || cids[i] != cids[i-1])",
scan_expr="a + b",
output_statement=r"""
if (item != prev_item) {
@pytools.memoize
def _cached_eval_expr_with_setup(assignments, expr):
    """Evaluate *expr* after *assignments* on the shared ``MaximaKernel``.

    The module-level ``_kernel_instance`` is created on first use and reused
    by later calls. The call is forwarded to that instance's
    ``clean_eval_expr_with_setup(assignments, expr)`` and its result is
    returned unchanged.
    """
    global _kernel_instance

    kernel = _kernel_instance
    if kernel is None:
        # First use: start the kernel and publish it for subsequent calls.
        kernel = _kernel_instance = MaximaKernel()

    return kernel.clean_eval_expr_with_setup(assignments, expr)
@memoize
def get_run_debug_directory():
    """Return the result of creating a unique ``"run-debug"`` directory.

    Delegates to ``make_unique_filesystem_object`` with the base name
    ``"run-debug"`` and a creator callback that makes the directory with
    ``os.mkdir`` and hands the name it was given back to the helper.
    Whatever the helper returns is returned to the caller.
    """
    from os import mkdir

    def _make_directory(path):
        # mkdir raises if the directory cannot be created; the helper is
        # presumably relying on that to pick another name -- TODO confirm.
        mkdir(path)
        return path

    return make_unique_filesystem_object("run-debug", creator=_make_directory)
@memoize
def select_block_grid_sizes(dev, data_shape, threads_per_block=None):
"""
Determine CUDA block and grid dimensions given device constraints.
Determine the CUDA block and grid dimensions allowed by a GPU
device that are sufficient for processing every element of an
array in a separate thread.
Parameters
----------
d : pycuda.driver.Device
Device object to be used.
data_shape : tuple
Shape of input data array. Must be of length 2.
threads_per_block : int, optional
Number of threads to execute in each block. If this is None,
@memoize(key=kernel_cache_key_args)
def _generate_kernel(self, *args):
    """Annotate ``self.func`` from the argument types, then generate.

    When ``self.func`` is set, type information is derived from *args* via
    ``self.get_type_info_from_args``, an ``AnnotationHelper`` annotates the
    function with it, and ``self.func`` is replaced by the helper's
    annotated function. The result of ``self._generate()`` is returned in
    either case.

    NOTE(review): the source view has lost its indentation; the final
    ``return`` is taken to sit outside the ``if`` -- confirm against the
    original file.
    """
    if self.func is not None:
        inferred_types = self.get_type_info_from_args(*args)
        annotator = AnnotationHelper(self.func, inferred_types)
        annotator.annotate()
        # Keep the annotated version so generation sees the type info.
        self.func = annotator.func

    return self._generate()
@memoize(key=lambda *args: tuple(args))
def get_scan(inp_f, out_f, dtype, backend):
    """Build a ``Scan`` from an input function, output function, dtype and backend.

    The four arguments are passed to ``Scan`` as the keywords ``input``,
    ``output``, ``dtype`` and ``backend`` respectively. The decorator's key
    function uses the tuple of positional arguments as the cache key.
    """
    scan_options = {
        "input": inp_f,
        "output": out_f,
        "dtype": dtype,
        "backend": backend,
    }
    return Scan(**scan_options)
@memoize(key=kernel_cache_key_kwargs, use_kwargs=True)
def _generate_kernel(self, **kwargs):
if self.input_func is not None:
arg_types = self.get_type_info_from_kwargs(
self.input_func, **kwargs)
arg_types['return_'] = dtype_to_knowntype(self.dtype)
helper = AnnotationHelper(self.input_func, arg_types)
helper.annotate()
self.input_func = helper.func
if self.output_func is not None:
arg_types = self.get_type_info_from_kwargs(
self.output_func, **kwargs)
helper = AnnotationHelper(self.output_func, arg_types)
helper.annotate()
self.output_func = helper.func