msg_generic_problem = "Error: CUDA device intialisation problem."
msg = getattr(e, 'msg', None)
if msg is not None:
if msg_not_found in msg:
err_msg = msg_not_found + msg_end
elif msg_disabled_by_user in msg:
err_msg = msg_disabled_by_user + msg_end
else:
err_msg = msg_generic_problem + " Message:" + msg
else:
err_msg = msg_generic_problem + " " + str(e)
# Best effort error report
print("%s\nError class: %s" % (err_msg, str(type(e))))
else:
    try:
        cu.detect()
        dv = ct.c_int(0)
        cudriver.cuDriverGetVersion(ct.byref(dv))
        print(fmt % ("CUDA driver version", dv.value))
        print("CUDA libraries:")
        cudadrv.libs.test(sys.platform, print_paths=False)
    except Exception:
        print(
            "Error: Probing CUDA failed (device and driver present, runtime problem?)\n")
print("")
print("__ROC Information__")
roc_is_available = roc.is_available()
print(fmt % ("ROC available", roc_is_available))
toolchains = []
try:
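# Hedged usage sketch of the probing APIs referenced above: numba.cuda.detect()
# prints a per-device summary and returns True if a usable GPU was found.
# (The `roc` module only exists in older Numba releases.)
from numba import cuda

if cuda.is_available():
    cuda.detect()                          # lists each GPU and whether Numba supports it
    print("Number of GPUs:", len(cuda.gpus))
else:
    print("No CUDA devices found")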
@cuda.jit
def sqplus2(input_data, output_data):
    for i in range(len(input_data)):
        d = input_data[i]
        output_data[i] = d * d + 2
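# Usage sketch for the sqplus2 kernel above (requires a CUDA-capable GPU).
# A single thread walks the whole array, matching the serial loop in the kernel.
import numpy as np

data = np.arange(8, dtype=np.float32)
out = np.zeros_like(data)
sqplus2[1, 1](data, out)     # Numba copies the host arrays to and from the device
print(out)                   # [ 2.  3.  6. 11. 18. 27. 38. 51.]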
@cuda.jit
def histogram(x, x_range, histogram_out):
    nbins = histogram_out.shape[0]
    xmin, xmax = x_range
    bin_width = (xmax - xmin) / nbins
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(start, x.shape[0], stride):
        # note that calling a numba.jit function from CUDA automatically
        # compiles an equivalent CUDA device function!
        bin_number = compute_bin(x[i], nbins, xmin, xmax)
        # counter[0] = counter[0] + 1
        if bin_number >= 0 and bin_number < histogram_out.shape[0]:
            cuda.atomic.add(histogram_out, bin_number, 1)
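# compute_bin is referenced above but not shown; a minimal, hedged sketch of a
# compatible helper. As the kernel's comment notes, a numba.jit function called
# from CUDA code is compiled into an equivalent device function.
import math
import numba

@numba.njit
def compute_bin(x, nbins, xmin, xmax):
    # Map x into [0, nbins); values on the right edge go into the last bin,
    # and out-of-range values are rejected by the bounds check in the kernel.
    if x == xmax:
        return nbins - 1
    return int(math.floor(nbins * (x - xmin) / (xmax - xmin)))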
class Cuda_shared_array(MacroTemplate):
    key = cuda.shared.array

class Cuda_local_array(MacroTemplate):
    key = cuda.local.array

class Cuda_const_arraylike(MacroTemplate):
    key = cuda.const.array_like

@intrinsic
class Cuda_syncthreads(ConcreteTemplate):
    key = cuda.syncthreads
    cases = [signature(types.none)]

@intrinsic
class Cuda_syncthreads_count(ConcreteTemplate):
    key = cuda.syncthreads_count
    cases = [signature(types.i4, types.i4)]

@intrinsic
class Cuda_syncthreads_and(ConcreteTemplate):
    key = cuda.syncthreads_and
    cases = [signature(types.i4, types.i4)]
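# The templates above register the typing for CUDA intrinsics such as
# cuda.shared.array and cuda.syncthreads; in user code they appear like this
# (a hedged sketch of a per-block sum, assuming 128 threads per block):
from numba import cuda, float32

@cuda.jit
def block_sum(x, out):
    tmp = cuda.shared.array(128, dtype=float32)
    i = cuda.grid(1)
    t = cuda.threadIdx.x
    tmp[t] = x[i] if i < x.shape[0] else 0.0
    cuda.syncthreads()              # wait until every thread has written tmp
    if t == 0:
        s = 0.0
        for j in range(128):
            s += tmp[j]
        out[cuda.blockIdx.x] = s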
def parent():
    arr = np.arange(10)
    darr = cuda.to_device(arr)
    ipch = darr.get_ipc_handle()
    # launch child proc
    mpc = mp.get_context('spawn')
    queue = mpc.Queue()
    childproc = mpc.Process(target=child, args=[queue])
    childproc.start()
    queue.put(ipch)
    childproc.join(1)
    hostarr = queue.get()
    print('original array:', arr)
    # device array is modified by child process
    print('device array:', darr.copy_to_host())
    print('returned host array', hostarr)
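# The child function used by parent() above is not shown; a minimal, hedged
# sketch that matches the parent's protocol: receive the IPC handle, open it
# as a device array, modify it, and send a host copy back through the queue.
from numba import cuda

def child(queue):
    ipch = queue.get()                  # IPC handle sent by the parent
    with ipch as darr:                  # opens the shared device allocation
        hostarr = darr.copy_to_host()
        hostarr += 1                    # hypothetical modification
        darr.copy_to_device(hostarr)    # visible to the parent afterwards
    queue.put(hostarr)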
@cuda.jit
def cuda_rt_to_pm( buffer_r, buffer_t, buffer_p, buffer_m ) :
    """
    Combine the arrays buffer_r and buffer_t to produce the
    arrays buffer_p and buffer_m, according to the rules of
    the Fourier-Hankel decomposition (see associated paper)
    """
    # Set up cuda grid
    iz, ir = cuda.grid(2)
    if (iz < buffer_r.shape[0]) and (ir < buffer_r.shape[1]) :
        # Use intermediate variables, as the arrays
        # buffer_r and buffer_t may actually point to the same
        # object as buffer_p and buffer_m, for economy of memory
        value_r = buffer_r[iz, ir]
        value_t = buffer_t[iz, ir]
        # Combine the values
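# A self-contained sketch of the 2D launch pattern used by kernels like the one
# above: one thread per (iz, ir) cell, with a bounds check inside the kernel.
# add_buffers and the shapes below are illustrative, not part of the snippet.
import numpy as np
from numba import cuda

@cuda.jit
def add_buffers(a, b, out):
    iz, ir = cuda.grid(2)
    if iz < a.shape[0] and ir < a.shape[1]:
        out[iz, ir] = a[iz, ir] + b[iz, ir]

a = np.random.rand(100, 64)
b = np.random.rand(100, 64)
out = np.zeros_like(a)
tpb = (16, 16)
bpg = ((a.shape[0] + tpb[0] - 1) // tpb[0], (a.shape[1] + tpb[1] - 1) // tpb[1])
add_buffers[bpg, tpb](a, b, out)
assert np.allclose(out, a + b)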
ux, uy, uz, Ex, Ey, Ez, Bx, By, Bz, w, w_times_level ):
    """
    For each ion macroparticle, decide whether it is going to
    be further ionized during this timestep, based on the ADK rate.
    Increment the elements in `ionization_level` accordingly, and update the
    `w_times_level` of the ions to take into account the change in level
    of the corresponding macroparticle.
    For the purpose of counting and creating the corresponding electrons,
    `is_ionized` (one element per macroparticle) is set to 1 at the position
    of the ionized ions, and `n_ionized` (one element per batch) counts
    the total number of ionized particles in the current batch.
    """
    # Loop over batches of particles
    i_batch = cuda.grid(1)
    if i_batch < N_batch:
        # Set the count of ionized particles in the batch to 0
        n_ionized[i_batch] = 0
        # Loop through the batch
        N_max = min( (i_batch+1)*batch_size, Ntot )
        for ip in range( i_batch*batch_size, N_max ):
            # Skip the ionization routine, if the maximal ionization level
            # has already been reached for this macroparticle
            level = ionization_level[ip]
            if level >= level_max:
                is_ionized[ip] = 0
                continue
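# The batching pattern above (one thread per batch of particles) in a
# self-contained, hedged sketch; count_positive and the sizes are illustrative,
# not part of the snippet's codebase.
import math
import numpy as np
from numba import cuda

@cuda.jit
def count_positive(x, n_positive, N_batch, batch_size, Ntot):
    i_batch = cuda.grid(1)
    if i_batch < N_batch:
        n_positive[i_batch] = 0
        N_max = min((i_batch + 1) * batch_size, Ntot)
        for ip in range(i_batch * batch_size, N_max):
            if x[ip] > 0:
                n_positive[i_batch] += 1

Ntot, batch_size = 10000, 128
N_batch = int(math.ceil(Ntot / batch_size))
x = np.random.randn(Ntot)
n_positive = np.zeros(N_batch, dtype=np.int32)
count_positive[(N_batch + 63) // 64, 64](x, n_positive, N_batch, batch_size, Ntot)
print(n_positive.sum(), (x > 0).sum())   # the two counts should agree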
    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    """
    # prefer `a`'s stream if it has one
    stream = getattr(a, 'stream', 0)
    if not b:
        cols, rows = a.shape
        strides = a.dtype.itemsize * cols, a.dtype.itemsize
        b = cuda.cudadrv.devicearray.DeviceNDArray(
            (rows, cols),
            strides,
            dtype=a.dtype,
            stream=stream)
    dt = nps.from_dtype(a.dtype)
    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    # we need to factor available threads into x and y axis
    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)
    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):
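# Usage sketch: in the Numba versions this snippet comes from, the routine is
# shipped as numba.cuda.kernels.transpose.transpose; with device arrays it
# fills `b` with the transpose of `a`.
import numpy as np
from numba import cuda
from numba.cuda.kernels.transpose import transpose

a = cuda.to_device(np.arange(12, dtype=np.float64).reshape(3, 4))
b = cuda.device_array((4, 3), dtype=np.float64)
transpose(a, b)
print(b.copy_to_host())    # the 4x3 transpose of a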
def select_device(dev, close=True):
    """
    Use numba to select the given device, optionally
    closing and opening up a new cuda context if it fails.
    :param dev: int device to select
    :param close: bool close the cuda context and create new one?
    """
    if numba.cuda.get_current_device().id != dev:
        logging.warning("Selecting device " + str(dev))
        if close:
            numba.cuda.close()
        numba.cuda.select_device(dev)
        if dev != numba.cuda.get_current_device().id:
            logging.warning("Current device " +
                            str(numba.cuda.get_current_device()) +
                            " does not match expected " + str(dev))
def build_grid_2d2_cuda( x, y, fntau, tau, nspread ):
    nf1 = fntau.shape[0]
    nf2 = fntau.shape[1]
    hx = 2 * np.pi / nf1
    hy = 2 * np.pi / nf2
    c = np.zeros(x.shape, dtype=fntau.dtype)
    device = cuda.get_current_device()
    n = x.shape[0]  # number of target points (one gridding-kernel evaluation per point)
    tpb = device.WARP_SIZE
    bpg = int(np.ceil(float(n)/tpb))
    gaussker_2d2_cuda[bpg, tpb](x, y, c, hx, hy, nf1, nf2, nspread, tau, fntau )
    return c/(nf1*nf2)
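# A self-contained sketch of the same launch pattern (one thread per point,
# WARP_SIZE threads per block); scale_points is illustrative, not part of the
# NUFFT code above.
import numpy as np
from numba import cuda

@cuda.jit
def scale_points(x, out, h):
    i = cuda.grid(1)
    if i < x.shape[0]:
        out[i] = x[i] * h

device = cuda.get_current_device()
n = 1000
tpb = device.WARP_SIZE                      # typically 32 threads per block
bpg = int(np.ceil(float(n) / tpb))          # enough blocks to cover all points
x = np.random.rand(n)
out = np.zeros_like(x)
scale_points[bpg, tpb](x, out, 2 * np.pi / 64)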