How to use pytools - 10 common examples

To help you get started, we’ve selected a few pytools examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github inducer / pycuda / pycuda / gpuarray.py View on Github external
@memoize
def _splay_backend(n, dev):
    # heavily modified from cublas
    from pycuda.tools import DeviceData
    devdata = DeviceData(dev)

    min_threads = devdata.warp_size
    max_threads = 128
    max_blocks = 4 * devdata.thread_blocks_per_mp \
            * dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)

    if n < min_threads:
        block_count = 1
        threads_per_block = min_threads
    elif n < (max_blocks * min_threads):
        block_count = (n + min_threads - 1) // min_threads
        threads_per_block = min_threads
github inducer / pycuda / examples / transpose.py View on Github external
@memoize
def _get_transpose_kernel():
    mod = SourceModule("""
      #define BLOCK_SIZE %(block_size)d
      #define A_BLOCK_STRIDE (BLOCK_SIZE*a_width)
      #define A_T_BLOCK_STRIDE (BLOCK_SIZE*a_height)

      __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
      {
        // Base indices in A and A_t
        int base_idx_a   = blockIdx.x*BLOCK_SIZE + blockIdx.y*A_BLOCK_STRIDE;
        int base_idx_a_t = blockIdx.y*BLOCK_SIZE + blockIdx.x*A_T_BLOCK_STRIDE;

        // Global indices in A and A_t
        int glob_idx_a   = base_idx_a + threadIdx.x + a_width*threadIdx.y;
        int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height*threadIdx.y;
github quantumsim / quantumsim / quantumsim / pauli_vectors / cuda.py View on Github external
dint = 1
        for i in sorted(rest_shape):
            if i * dint > 256 // (dim0_out * dim1_out):
                break
            else:
                dint *= i

        # dim_a_out, dim_b_out, d_internal (arbitrary)
        block = (dim0_out, dim1_out, dint)
        blocksize = dim1_out * dim0_out * dint
        sh_mem_size = dint * dim1_in * dim0_in  # + ptm.size
        grid_size = max(1, (new_size - 1) // blocksize + 1)
        grid = (grid_size, 1, 1)

        dim_z = pytools.product(self._data.shape[qubit1 + 1:])
        dim_y = pytools.product(self._data.shape[qubit0 + 1:qubit1])
        dim_rho = new_size  # self.data.size

        _two_qubit_general_ptm.prepared_call(
            grid,
            block,
            self._data.gpudata,
            self._work_data.gpudata,
            ptm_gpu.gpudata,
            dim0_in, dim1_in,
            dim_z,
            dim_y,
            dim_rho,
            shared_size=8 * sh_mem_size)

        self._data, self._work_data = self._work_data, self._data
github quantumsim / quantumsim / quantumsim / pauli_vectors / cuda.py View on Github external
dint = 1
        for i in sorted(rest_shape):
            if i * dint > 256 // (dim0_out * dim1_out):
                break
            else:
                dint *= i

        # dim_a_out, dim_b_out, d_internal (arbitrary)
        block = (dim0_out, dim1_out, dint)
        blocksize = dim1_out * dim0_out * dint
        sh_mem_size = dint * dim1_in * dim0_in  # + ptm.size
        grid_size = max(1, (new_size - 1) // blocksize + 1)
        grid = (grid_size, 1, 1)

        dim_z = pytools.product(self._data.shape[qubit1 + 1:])
        dim_y = pytools.product(self._data.shape[qubit0 + 1:qubit1])
        dim_rho = new_size  # self.data.size

        _two_qubit_general_ptm.prepared_call(
            grid,
            block,
            self._data.gpudata,
            self._work_data.gpudata,
            ptm_gpu.gpudata,
            dim0_in, dim1_in,
            dim_z,
            dim_y,
            dim_rho,
            shared_size=8 * sh_mem_size)

        self._data, self._work_data = self._work_data, self._data
github inducer / sumpy / test / test_tree.py View on Github external
@pytools.test.mark_test.opencl
def test_tree(ctx_getter, do_plot=False):
    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    #for dims in [2, 3]:
    for dims in [2]:
        nparticles = 10000
        dtype = np.float64

        from pyopencl.clrandom import RanluxGenerator
        rng = RanluxGenerator(queue, seed=15)

        from pytools.obj_array import make_obj_array
        particles = make_obj_array([
            rng.normal(queue, nparticles, dtype=dtype)
            for i in range(dims)])

        if do_plot:
            pt.plot(particles[0].get(), particles[1].get(), "x")

        from sumpy.tree import TreeBuilder
        tb = TreeBuilder(ctx)

        queue.finish()
        print "building..."
        tree = tb(queue, particles, max_particles_in_box=30)
        print "%d boxes, testing..." % tree.nboxes

        starts = tree.box_starts.get()
        pcounts = tree.box_particle_counts.get()
github inducer / loopy / test / test_statistics.py View on Github external
],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    group_size = bsize*bsize
    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    f32mul = op_map[
                        lp.Op(np.float32, 'mul', CG.SUBGROUP)
                        ].eval_with_dict(params)
    f32add = op_map[
                        lp.Op(np.float32, 'add', CG.SUBGROUP)
                        ].eval_with_dict(params)
    i32ops = op_map[
                        lp.Op(np.int32, 'add', CG.SUBGROUP)
github inducer / loopy / test / test_statistics.py View on Github external
],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    # knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    # knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    group_size = bsize*bsize
    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                           subgroup_size=SGS)

    f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1},
                             gid_strides={1: bsize},
                             direction='load', variable='b',
                             variable_tag='mmbload',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
    f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={1: Variable('m')},
                             gid_strides={0: Variable('m')*bsize},
                             direction='load',
github natj / runko / tests / test_pic.py View on Github external
def insert_em(grid, conf, ffunc):

    Lx  = conf.Nx*conf.NxMesh #XXX scaled length
    for i in range(grid.get_Nx()):
        for j in range(grid.get_Ny()):
            for k in range(grid.get_Nz()):
                c = grid.get_tile(i,j,k)
                yee = c.get_yee()

                for l in range(conf.NxMesh):
                    for m in range(conf.NyMesh):
                        for n in range(conf.NzMesh):

                            # get x_i,j,k
                            xloc0 = pytools.ind2loc(grid, (i,j,k), (l,m,n), conf)

                            #get x_i+1/2, x_j+1/2, x_k+1/2
                            xloc1 = pytools.ind2loc(grid, (i,j,k), (l+1,m,  n),   conf)
                            yloc1 = pytools.ind2loc(grid, (i,j,k), (l,  m+1,n),   conf)
                            zloc1 = pytools.ind2loc(grid, (i,j,k), (l,  m,  n+1), conf)

                            # values in Yee lattice corners
                            xcor = xloc0[0]
                            ycor = xloc0[1]
                            zcor = xloc0[2]

                            # values in Yee lattice mids
                            xmid = 0.5*(xloc0[0] + xloc1[0])
                            ymid = 0.5*(xloc0[1] + yloc1[1])
                            zmid = 0.5*(xloc0[2] + zloc1[2])
github natj / runko / tests / test_pic.py View on Github external
Lx  = conf.Nx*conf.NxMesh #XXX scaled length
    for i in range(grid.get_Nx()):
        for j in range(grid.get_Ny()):
            for k in range(grid.get_Nz()):
                c = grid.get_tile(i,j,k)
                yee = c.get_yee()

                for l in range(conf.NxMesh):
                    for m in range(conf.NyMesh):
                        for n in range(conf.NzMesh):

                            # get x_i,j,k
                            xloc0 = pytools.ind2loc(grid, (i,j,k), (l,m,n), conf)

                            #get x_i+1/2, x_j+1/2, x_k+1/2
                            xloc1 = pytools.ind2loc(grid, (i,j,k), (l+1,m,  n),   conf)
                            yloc1 = pytools.ind2loc(grid, (i,j,k), (l,  m+1,n),   conf)
                            zloc1 = pytools.ind2loc(grid, (i,j,k), (l,  m,  n+1), conf)

                            # values in Yee lattice corners
                            xcor = xloc0[0]
                            ycor = xloc0[1]
                            zcor = xloc0[2]

                            # values in Yee lattice mids
                            xmid = 0.5*(xloc0[0] + xloc1[0])
                            ymid = 0.5*(xloc0[1] + yloc1[1])
                            zmid = 0.5*(xloc0[2] + zloc1[2])

                            #val = ffunc(xmid, ymid, zmid)

                            # enforce Yee lattice structure
github natj / runko / tests / test_pic.py View on Github external
for i in range(grid.get_Nx()):
        for j in range(grid.get_Ny()):
            for k in range(grid.get_Nz()):
                c = grid.get_tile(i,j,k)
                yee = c.get_yee()

                for l in range(conf.NxMesh):
                    for m in range(conf.NyMesh):
                        for n in range(conf.NzMesh):

                            # get x_i,j,k
                            xloc0 = pytools.ind2loc(grid, (i,j,k), (l,m,n), conf)

                            #get x_i+1/2, x_j+1/2, x_k+1/2
                            xloc1 = pytools.ind2loc(grid, (i,j,k), (l+1,m,  n),   conf)
                            yloc1 = pytools.ind2loc(grid, (i,j,k), (l,  m+1,n),   conf)
                            zloc1 = pytools.ind2loc(grid, (i,j,k), (l,  m,  n+1), conf)

                            # values in Yee lattice corners
                            xcor = xloc0[0]
                            ycor = xloc0[1]
                            zcor = xloc0[2]

                            # values in Yee lattice mids
                            xmid = 0.5*(xloc0[0] + xloc1[0])
                            ymid = 0.5*(xloc0[1] + yloc1[1])
                            zmid = 0.5*(xloc0[2] + zloc1[2])

                            #val = ffunc(xmid, ymid, zmid)

                            # enforce Yee lattice structure
                            yee.ex[l,m,n] = ffunc(xmid, ycor, zcor)