log_filename = None
if not plans:
    raise RuntimeError("no valid CUDA execution plans found")

if set(["cuda_no_plan", "cuda_no_plan_"+opt_name]) & debug_flags:
    # planning disabled: just return the plan with the highest occupancy
    from pytools import argmax2
    return argmax2((plan, plan.occupancy_record().occupancy)
            for plan in plans), 0

max_occup = max(plan.occupancy_record().occupancy for plan in plans)
desired_occup = occupancy_slack*max_occup

if log_filename is not None:
    from pytools import single_valued
    feature_columns = single_valued(p.feature_columns() for p in plans)
    feature_names = [fc.split()[0] for fc in feature_columns]

    try:
        import sqlite3 as sqlite
    except ImportError:
        from pysqlite2 import dbapi2 as sqlite

    db_conn = sqlite.connect("plan-%s.dat" % log_filename)
    try:
        db_conn.execute("""
            create table data (
                id integer primary key autoincrement,
                %s,
                value real)"""
            % ", ".join(feature_columns))
def index_list_backend(self, ilists):
    from pytools import single_valued
    ilist_length = single_valued(len(il) for il in ilists)
    assert ilist_length == self.plan.dofs_per_face

    from cgen import Typedef, POD

    from pytools import flatten
    flat_ilists_uncast = numpy.array(list(flatten(ilists)))

    # pick the narrowest unsigned integer type that can hold every index
    if numpy.max(flat_ilists_uncast) >= 256:
        tp = numpy.uint16
    else:
        tp = numpy.uint8

    flat_ilists = numpy.asarray(flat_ilists_uncast, dtype=tp)
    assert (flat_ilists == flat_ilists_uncast).all()

    return GPUIndexLists(
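
# Illustrative check (not from the original source): the uint8/uint16 choice
# above is safe because the cast is verified element-wise against the original
# indices. `idx` below is a made-up example array.
import numpy
idx = numpy.array([0, 17, 255])
assert (idx.astype(numpy.uint8) == idx).all()   # every index < 256 fits in uint8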
vstacked_matrices = [
        numpy.vstack(given.microblock.elements*(m,))
        for m in diff_op_cls.matrices(elgroup)
        ]

segments = []

from pytools import single_valued
for segment_start in range(0, given.microblock.elements*given.dofs_per_el(),
        self.plan.segment_size):
    matrices = [
            m[segment_start:segment_start+self.plan.segment_size]
            for m in vstacked_matrices]

    # append zero-filled padding columns to each segment
    matrices.append(
            numpy.zeros((single_valued(m.shape[0] for m in matrices),
                additional_columns)))

    diffmats = numpy.asarray(
            numpy.hstack(matrices),
            dtype=given.float_type,
            order="C")
    segments.append(buffer(diffmats))

from hedge.backends.cuda.tools import pad_and_join

from pytools import Record
class GPUDifferentiationMatrices(Record):
    pass

return GPUDifferentiationMatrices(
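
# Illustrative sketch (not from the original source): pytools.Record is a plain
# attribute container, so whatever keyword arguments the call above passes to
# GPUDifferentiationMatrices become attributes of the returned object.
# _ExampleRecord below is a made-up name.
from pytools import Record

class _ExampleRecord(Record):
    pass

r = _ExampleRecord(rows=4, cols=8)
assert r.rows == 4 and r.cols == 8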
from pyopencl.array import empty

if len(arrays) == 0:
    return empty(queue, (), dtype=np.float64)

if queue is None:
    for ary in arrays:
        if ary.queue is not None:
            queue = ary.queue
            break

from pytools import all_equal, single_valued
if not all_equal(len(ary.shape) for ary in arrays):
    raise ValueError("arguments must all have the same number of axes")

lead_shape = single_valued(ary.shape[:-1] for ary in arrays)
w = _builtin_sum([ary.shape[-1] for ary in arrays])

result = empty(queue, lead_shape+(w,), arrays[0].dtype)
index = 0
for ary in arrays:
    result[..., index:index+ary.shape[-1]] = ary
    index += ary.shape[-1]

return result
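
# Illustrative analogy (not from the original source): the copy loop above lays
# the arrays side by side along the last axis, much as numpy.hstack does for
# these 2-D host arrays. `a` and `b` below are made-up examples.
import numpy as np
a = np.zeros((2, 3))
b = np.ones((2, 5))
assert np.hstack([a, b]).shape == (2, 8)   # lead shape (2,), widths 3 + 5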
def multi_take(arrays, indices, out=None, queue=None):
    if not len(arrays):
        return []

    assert len(indices.shape) == 1

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator
    context = indices.context
    queue = queue or indices.queue

    vec_count = len(arrays)

    if out is None:
        out = [Array(context, queue, indices.shape, a_dtype,
                allocator=a_allocator)
                for i in range(vec_count)]
    else:
        if len(out) != len(arrays):
            raise ValueError("out and arrays must have the same length")

    chunk_size = _builtin_min(vec_count, 10)
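
# Note (assumption, not from the original source): Array here is presumably
# pyopencl.array.Array, and _builtin_min/_builtin_sum are presumably
# module-level aliases for Python's built-in min/sum, kept because the
# surrounding module defines functions with those names.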
def index_list_backend(self, ilists):
    from pytools import single_valued
    ilist_length = single_valued(len(il) for il in ilists)
    assert ilist_length == self.plan.given.dofs_per_face()

    from codepy.cgen import Typedef, POD

    from pytools import flatten
    flat_ilists_uncast = numpy.array(list(flatten(ilists)))

    if numpy.max(flat_ilists_uncast) >= 256:
        tp = numpy.uint16
    else:
        tp = numpy.uint8

    flat_ilists = numpy.asarray(flat_ilists_uncast, dtype=tp)
    assert (flat_ilists == flat_ilists_uncast).all()

    return GPUIndexLists(
axis_names = AXIS_NAMES[:dimensions]
max_extent = max(
        bbox["max_"+ax] - bbox["min_"+ax]
        for ax in axis_names)
# make bbox square and slightly larger at the top, to ensure scaled
# coordinates are always < 1
for ax in axis_names:
    bbox["max_"+ax] = bbox["min_"+ax] + (1+1e-4)*max_extent
# {{{ get kernel info
from pytools import single_valued
coord_dtype = single_valued(coord.dtype for coord in particles)
particle_id_dtype = np.uint32
box_id_dtype = np.uint32
knl_info = self.get_kernel_info(dimensions, coord_dtype, particle_id_dtype, box_id_dtype)
# }}}
nparticles = single_valued(len(coord) for coord in particles)
morton_bin_counts = cl.array.empty(queue, nparticles, dtype=knl_info.morton_bin_count_dtype)
morton_nrs = cl.array.empty(queue, nparticles, dtype=np.uint8)
box_start_flags = cl.array.zeros(queue, nparticles, dtype=np.int8)
box_ids = cl.array.zeros(queue, nparticles, dtype=np.uint32)
unsplit_box_ids = cl.array.zeros(queue, nparticles, dtype=np.uint32)
split_box_ids = cl.array.zeros(queue, nparticles, dtype=np.uint32)
from pytools import div_ceil
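
# Illustrative check (not from the original source): pytools.div_ceil, imported
# just above, performs ceiling division.
assert div_ceil(10, 4) == 3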
def numpy_linear_comb(lin_comb):
    assert lin_comb

    scalar_dtypes = tuple(numpy.array(fac).dtype for fac, ary in lin_comb)

    from pytools import single_valued, indices_in_shape, flatten, \
            match_precision
    from codepy.elementwise import make_linear_comb_kernel

    if single_valued(is_obj_array(ary) for fac, ary in lin_comb):
        oa_shape = single_valued(ary.shape for fac, ary in lin_comb)
        result = numpy.zeros(oa_shape, dtype=object)
        for i in indices_in_shape(oa_shape):
            el_shape = single_valued(ary[i].shape for fac, ary in lin_comb)

            vector_dtypes = tuple(ary[i].dtype for fac, ary in lin_comb)
            scalar_dtypes = tuple(
                    match_precision(sd, vd)
                    for sd, vd in zip(scalar_dtypes, vector_dtypes))

            kernel, result_dtype = make_linear_comb_kernel(
                    scalar_dtypes, vector_dtypes)

            result[i] = numpy.zeros(el_shape, result_dtype)
            kernel(result[i], *tuple(flatten(
                (fac, ary[i]) for fac, ary in lin_comb)))

        return result
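
# Illustrative sketch (not from the original source): pytools.flatten interleaves
# the (factor, array) pairs, so the kernel above is effectively invoked as
# kernel(result[i], fac0, ary0[i], fac1, ary1[i], ...).
from pytools import flatten
assert list(flatten([(2.0, "a"), (3.0, "b")])) == [2.0, "a", 3.0, "b"]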