Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
@memoize
def _splay_backend(n, dev):
# heavily modified from cublas
from pycuda.tools import DeviceData
devdata = DeviceData(dev)
min_threads = devdata.warp_size
max_threads = 128
max_blocks = 4 * devdata.thread_blocks_per_mp \
* dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)
if n < min_threads:
block_count = 1
threads_per_block = min_threads
elif n < (max_blocks * min_threads):
block_count = (n + min_threads - 1) // min_threads
threads_per_block = min_threads
@memoize
def _get_transpose_kernel():
mod = SourceModule("""
#define BLOCK_SIZE %(block_size)d
#define A_BLOCK_STRIDE (BLOCK_SIZE*a_width)
#define A_T_BLOCK_STRIDE (BLOCK_SIZE*a_height)
__global__ void transpose(float *A_t, float *A, int a_width, int a_height)
{
// Base indices in A and A_t
int base_idx_a = blockIdx.x*BLOCK_SIZE + blockIdx.y*A_BLOCK_STRIDE;
int base_idx_a_t = blockIdx.y*BLOCK_SIZE + blockIdx.x*A_T_BLOCK_STRIDE;
// Global indices in A and A_t
int glob_idx_a = base_idx_a + threadIdx.x + a_width*threadIdx.y;
int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height*threadIdx.y;
dint = 1
for i in sorted(rest_shape):
if i * dint > 256 // (dim0_out * dim1_out):
break
else:
dint *= i
# dim_a_out, dim_b_out, d_internal (arbitrary)
block = (dim0_out, dim1_out, dint)
blocksize = dim1_out * dim0_out * dint
sh_mem_size = dint * dim1_in * dim0_in # + ptm.size
grid_size = max(1, (new_size - 1) // blocksize + 1)
grid = (grid_size, 1, 1)
dim_z = pytools.product(self._data.shape[qubit1 + 1:])
dim_y = pytools.product(self._data.shape[qubit0 + 1:qubit1])
dim_rho = new_size # self.data.size
_two_qubit_general_ptm.prepared_call(
grid,
block,
self._data.gpudata,
self._work_data.gpudata,
ptm_gpu.gpudata,
dim0_in, dim1_in,
dim_z,
dim_y,
dim_rho,
shared_size=8 * sh_mem_size)
self._data, self._work_data = self._work_data, self._data
dint = 1
for i in sorted(rest_shape):
if i * dint > 256 // (dim0_out * dim1_out):
break
else:
dint *= i
# dim_a_out, dim_b_out, d_internal (arbitrary)
block = (dim0_out, dim1_out, dint)
blocksize = dim1_out * dim0_out * dint
sh_mem_size = dint * dim1_in * dim0_in # + ptm.size
grid_size = max(1, (new_size - 1) // blocksize + 1)
grid = (grid_size, 1, 1)
dim_z = pytools.product(self._data.shape[qubit1 + 1:])
dim_y = pytools.product(self._data.shape[qubit0 + 1:qubit1])
dim_rho = new_size # self.data.size
_two_qubit_general_ptm.prepared_call(
grid,
block,
self._data.gpudata,
self._work_data.gpudata,
ptm_gpu.gpudata,
dim0_in, dim1_in,
dim_z,
dim_y,
dim_rho,
shared_size=8 * sh_mem_size)
self._data, self._work_data = self._work_data, self._data
@pytools.test.mark_test.opencl
def test_tree(ctx_getter, do_plot=False):
ctx = ctx_getter()
queue = cl.CommandQueue(ctx)
#for dims in [2, 3]:
for dims in [2]:
nparticles = 10000
dtype = np.float64
from pyopencl.clrandom import RanluxGenerator
rng = RanluxGenerator(queue, seed=15)
from pytools.obj_array import make_obj_array
particles = make_obj_array([
rng.normal(queue, nparticles, dtype=dtype)
for i in range(dims)])
if do_plot:
pt.plot(particles[0].get(), particles[1].get(), "x")
from sumpy.tree import TreeBuilder
tb = TreeBuilder(ctx)
queue.finish()
print "building..."
tree = tb(queue, particles, max_particles_in_box=30)
print "%d boxes, testing..." % tree.nboxes
starts = tree.box_starts.get()
pcounts = tree.box_particle_counts.get()
],
name="matmul", assumptions="n,m,ell >= 1")
knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
knl = lp.split_iname(knl, "k", bsize)
knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")
n = 512
m = 256
ell = 128
params = {'n': n, 'm': m, 'ell': ell}
group_size = bsize*bsize
n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
subgroups_per_group = div_ceil(group_size, SGS)
n_subgroups = n_workgroups*subgroups_per_group
sync_map = lp.get_synchronization_map(knl)
assert len(sync_map) == 2
assert sync_map["kernel_launch"].eval_with_dict(params) == 1
assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize
op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
f32mul = op_map[
lp.Op(np.float32, 'mul', CG.SUBGROUP)
].eval_with_dict(params)
f32add = op_map[
lp.Op(np.float32, 'add', CG.SUBGROUP)
].eval_with_dict(params)
i32ops = op_map[
lp.Op(np.int32, 'add', CG.SUBGROUP)
],
name="matmul", assumptions="n,m,ell >= 1")
knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
knl = lp.split_iname(knl, "k", bsize)
# knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
# knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")
n = 512
m = 256
ell = 128
params = {'n': n, 'm': m, 'ell': ell}
group_size = bsize*bsize
n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
subgroups_per_group = div_ceil(group_size, SGS)
n_subgroups = n_workgroups*subgroups_per_group
mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
subgroup_size=SGS)
f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
lid_strides={0: 1},
gid_strides={1: bsize},
direction='load', variable='b',
variable_tag='mmbload',
count_granularity=CG.WORKITEM)
].eval_with_dict(params)
f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
lid_strides={1: Variable('m')},
gid_strides={0: Variable('m')*bsize},
direction='load',
def insert_em(grid, conf, ffunc):
Lx = conf.Nx*conf.NxMesh #XXX scaled length
for i in range(grid.get_Nx()):
for j in range(grid.get_Ny()):
for k in range(grid.get_Nz()):
c = grid.get_tile(i,j,k)
yee = c.get_yee()
for l in range(conf.NxMesh):
for m in range(conf.NyMesh):
for n in range(conf.NzMesh):
# get x_i,j,k
xloc0 = pytools.ind2loc(grid, (i,j,k), (l,m,n), conf)
#get x_i+1/2, x_j+1/2, x_k+1/2
xloc1 = pytools.ind2loc(grid, (i,j,k), (l+1,m, n), conf)
yloc1 = pytools.ind2loc(grid, (i,j,k), (l, m+1,n), conf)
zloc1 = pytools.ind2loc(grid, (i,j,k), (l, m, n+1), conf)
# values in Yee lattice corners
xcor = xloc0[0]
ycor = xloc0[1]
zcor = xloc0[2]
# values in Yee lattice mids
xmid = 0.5*(xloc0[0] + xloc1[0])
ymid = 0.5*(xloc0[1] + yloc1[1])
zmid = 0.5*(xloc0[2] + zloc1[2])
Lx = conf.Nx*conf.NxMesh #XXX scaled length
for i in range(grid.get_Nx()):
for j in range(grid.get_Ny()):
for k in range(grid.get_Nz()):
c = grid.get_tile(i,j,k)
yee = c.get_yee()
for l in range(conf.NxMesh):
for m in range(conf.NyMesh):
for n in range(conf.NzMesh):
# get x_i,j,k
xloc0 = pytools.ind2loc(grid, (i,j,k), (l,m,n), conf)
#get x_i+1/2, x_j+1/2, x_k+1/2
xloc1 = pytools.ind2loc(grid, (i,j,k), (l+1,m, n), conf)
yloc1 = pytools.ind2loc(grid, (i,j,k), (l, m+1,n), conf)
zloc1 = pytools.ind2loc(grid, (i,j,k), (l, m, n+1), conf)
# values in Yee lattice corners
xcor = xloc0[0]
ycor = xloc0[1]
zcor = xloc0[2]
# values in Yee lattice mids
xmid = 0.5*(xloc0[0] + xloc1[0])
ymid = 0.5*(xloc0[1] + yloc1[1])
zmid = 0.5*(xloc0[2] + zloc1[2])
#val = ffunc(xmid, ymid, zmid)
# enforce Yee lattice structure
for i in range(grid.get_Nx()):
for j in range(grid.get_Ny()):
for k in range(grid.get_Nz()):
c = grid.get_tile(i,j,k)
yee = c.get_yee()
for l in range(conf.NxMesh):
for m in range(conf.NyMesh):
for n in range(conf.NzMesh):
# get x_i,j,k
xloc0 = pytools.ind2loc(grid, (i,j,k), (l,m,n), conf)
#get x_i+1/2, x_j+1/2, x_k+1/2
xloc1 = pytools.ind2loc(grid, (i,j,k), (l+1,m, n), conf)
yloc1 = pytools.ind2loc(grid, (i,j,k), (l, m+1,n), conf)
zloc1 = pytools.ind2loc(grid, (i,j,k), (l, m, n+1), conf)
# values in Yee lattice corners
xcor = xloc0[0]
ycor = xloc0[1]
zcor = xloc0[2]
# values in Yee lattice mids
xmid = 0.5*(xloc0[0] + xloc1[0])
ymid = 0.5*(xloc0[1] + yloc1[1])
zmid = 0.5*(xloc0[2] + zloc1[2])
#val = ffunc(xmid, ymid, zmid)
# enforce Yee lattice structure
yee.ex[l,m,n] = ffunc(xmid, ycor, zcor)