import numpy as np
from numba import cuda

# Parameters
Nz = 2048
Nr = 256
rmax = 50.e-6
m = 0
# Initialize the random test_field
interp_field_r = np.random.rand(Nz, Nr) + 1.j*np.random.rand(Nz, Nr)
interp_field_t = np.random.rand(Nz, Nr) + 1.j*np.random.rand(Nz, Nr)
d_interp_field_r = cuda.to_device( interp_field_r )
d_interp_field_t = cuda.to_device( interp_field_t )
# Initialize the field in spectral space
spect_field_p = np.empty_like( interp_field_r )
spect_field_m = np.empty_like( interp_field_t )
d_spect_field_p = cuda.to_device( spect_field_p )
d_spect_field_m = cuda.to_device( spect_field_m )
# Initialize the field after back and forth transformation
back_field_r = np.empty_like( interp_field_r )
back_field_t = np.empty_like( interp_field_t )
d_back_field_r = cuda.to_device( back_field_r )
d_back_field_t = cuda.to_device( back_field_t )
# ----------------
# Scalar transform
# ----------------
print( '\n ### Scalar transform \n' )
# Perform the transform on the CPU
trans_cpu = SpectralTransformer( Nz, Nr, m, rmax )
# Loop several times and keep the fastest run,
# so that compilation time is excluded
tmin = 1.
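# The timing loop itself is not part of this snippet. Below is a minimal
# sketch of the pattern the comment above describes (run several times and
# keep the fastest, so that first-call compilation overhead is excluded);
# the interp2spect_scal / spect2interp_scal method names are assumptions
# about the SpectralTransformer API, not taken from this snippet.
import time
for i in range(10):
    s = time.time()
    trans_cpu.interp2spect_scal( interp_field_r, spect_field_p )
    trans_cpu.spect2interp_scal( spect_field_p, back_field_r )
    e = time.time()
    tmin = min( tmin, e - s )
print( 'Fastest CPU time: %.3f ms' % (1.e3 * tmin) )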
results = []
for groupby_result in groupby_results:
    list_of_indices = list(
        np.unique(groupby_result[check_list[-1]].to_array().astype(int))
    )
    groupby_as_ndarray = cuda.to_device(
        groupby_result.to_pandas().values.astype(float)
    )
    del groupby_result
    gc.collect()
    max_s = int((max_1 - min_1) / stride_1) + 1
    min_s = int((max_2 - min_2) / stride_2) + 1
    result = cuda.to_device(
        np.zeros(shape=(min_s, max_s)).astype(np.float64)
    )
    calc_cumsum_data_tile[64, 64](groupby_as_ndarray, result)
    if not cumsum:
        result_np = result.copy_to_host()
    else:
        result_np = np.cumsum(result.copy_to_host(), axis=1)
    result_temp = format_result(result_np, return_format)
    results.append(result_temp[result_temp.index.isin(list_of_indices)])
if len(results) == 1:
    return results[0]
def send_to_gpu(self):
"""
Transfer the tracking data from the CPU to the GPU
"""
self.id = cuda.to_device( self.id )
def random(size):
    import numba.cuda as cuda
    import numpy as np
    arr = np.random.randint(low=0, high=255, size=size,
                            dtype=np.uint8)
    return cuda.to_device(arr)
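# Illustrative use of the helper above (not part of the original snippet):
# the returned object is a Numba DeviceNDArray resident on the GPU; it can be
# passed directly to a @cuda.jit kernel or copied back with copy_to_host().
d_noise = random(1024)
noise = d_noise.copy_to_host()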
C[y, x] = 0
for i in range(n):
    C[y, x] += A[y, i] * B[i, x]
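# cu_square_matrix_mul (launched further below) is not defined in this
# snippet. The following is a minimal sketch of a naive kernel consistent
# with the loop body above; the kernel name comes from the launch site, while
# the rest (no shared-memory tiling, n taken from C.shape) is an assumption.
from numba import cuda

@cuda.jit
def cu_square_matrix_mul(A, B, C):
    # One thread per output element C[y, x]
    x, y = cuda.grid(2)
    n = C.shape[0]
    if x < n and y < n:
        C[y, x] = 0
        for i in range(n):
            C[y, x] += A[y, i] * B[i, x]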
A = np.array(np.random.random((n, n)), dtype=np.float32)
B = np.array(np.random.random((n, n)), dtype=np.float32)
C = np.empty_like(A)
print("N = %d x %d" % (n, n))
s = time()
stream = cuda.stream()
with stream.auto_synchronize():
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)
    dC = cuda.to_device(C, stream)
    cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
    dC.to_host(stream)
e = time()
tcuda = e - s
# Host compute
Amat = np.matrix(A)
Bmat = np.matrix(B)
s = time()
Cans = Amat * Bmat
e = time()
tcpu = e - s
time0 = time.time()
for i in range(iterations):
    black_scholes_numba(callResultNumba, putResultNumba, stockPrice,
                        optionStrike, optionYears, RISKFREE, VOLATILITY)
time1 = time.time()
print("Numba Time: %f msec" %
      ((1000 * (time1 - time0)) / iterations))
time0 = time.time()
blockdim = 1024, 1
griddim = int(math.ceil(float(OPT_N)/blockdim[0])), 1
stream = cuda.stream()
d_callResult = cuda.to_device(callResultCuda, stream)
d_putResult = cuda.to_device(putResultCuda, stream)
d_stockPrice = cuda.to_device(stockPrice, stream)
d_optionStrike = cuda.to_device(optionStrike, stream)
d_optionYears = cuda.to_device(optionYears, stream)
time1 = time.time()
for i in range(iterations):
    black_scholes_cuda[griddim, blockdim, stream](
        d_callResult, d_putResult, d_stockPrice, d_optionStrike,
        d_optionYears, RISKFREE, VOLATILITY)
    d_callResult.to_host(stream)
    d_putResult.to_host(stream)
    stream.synchronize()
time2 = time.time()
dt = (time1 - time0) * 10 + (time2 - time1)
print("Numba / CUDA time: %f msec" % ((1000 * dt) / iterations))
delta = np.abs(callResultNumpy - callResultCuda)
L1norm = delta.sum() / np.abs(callResultNumpy).sum()
print("L1 norm: %E" % L1norm)
Anew[j, 0] = 1.0
print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))
timer = time.time()
iter = 0
blockdim = (32, 32)
griddim = (NN//blockdim[0], NM//blockdim[1])
error_grid = np.zeros_like(A)
stream = cuda.stream()
dA = cuda.to_device(A, stream) # to device and don't come back
dAnew = cuda.to_device(Anew, stream) # to device and don't come back
derror_grid = cuda.to_device(error_grid, stream)
while error > tol and iter < iter_max:
    assert error_grid.dtype == np.float64
    jacobi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)
    derror_grid.to_host(stream)
    # error_grid is available on host
    stream.synchronize()
    error = np.abs(error_grid).max()
    # swap dA and dAnew
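    # The snippet ends at the swap comment above; a plausible continuation
    # (assumed, not from the original) rebinds the device arrays and advances
    # the iteration counter:
    dA, dAnew = dAnew, dA
    iter += 1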
def numba_gpu_histogram(a, bins):
    # Move data to GPU so we can do two operations on it
    a_gpu = cuda.to_device(a)
    # a_gpu = a
    # index_gpu = cuda.to_device(np.array(index))
    ### Find min and max value in array
    dtype_min, dtype_max = dtype_min_max(a.dtype)
    # Put them in the array in reverse order so that they will be replaced by the first element in the array
    min_max_array_gpu = cuda.to_device(np.array([dtype_max, dtype_min], dtype=a.dtype))
    min_max[64, 64](a_gpu, min_max_array_gpu)
    a_min, a_max = min_max_array_gpu.copy_to_host()
    # print(a_min, a_max)
    # SPEEDTIP: Skip this step if you don't need to reproduce the NumPy histogram edge array
    bin_edges = get_bin_edges(bins, a_min, a_max)  # Doing this on CPU for now
    # counter = cuda.to_device(np.array([0]))
    ### Bin the data into a histogram
    # print(bins)
    histogram_out = cuda.to_device(np.zeros(shape=(bins,), dtype=np.int32))
    histogram[64, 64](a_gpu, a_min, a_max, histogram_out)
    # print(counter.copy_to_host())
    # sol = histogram_out.copy_to_host()
    # print(sol)
    # print(len(sol))
    return histogram_out.copy_to_host(), bin_edges
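# min_max and histogram, launched by the function above, are @cuda.jit kernels
# that are not part of this snippet. A minimal sketch, assuming grid-stride
# loops with atomics (the exact kernels used above may differ); the atomics
# are why min_max_array_gpu is pre-filled with (dtype_max, dtype_min) above.
from numba import cuda

@cuda.jit
def min_max(x, min_max_array):
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(start, x.shape[0], stride):
        # Atomics resolve races between threads updating the shared result
        cuda.atomic.min(min_max_array, 0, x[i])
        cuda.atomic.max(min_max_array, 1, x[i])

@cuda.jit
def histogram(x, x_min, x_max, histogram_out):
    nbins = histogram_out.shape[0]
    bin_width = (x_max - x_min) / nbins
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(start, x.shape[0], stride):
        bin_number = int((x[i] - x_min) / bin_width)
        if bin_number >= 0 and bin_number < nbins:
            cuda.atomic.add(histogram_out, bin_number, 1)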
device_T_A = cuda.to_device(T_A)
device_T_B = cuda.to_device(T_B)
device_M_T = cuda.to_device(M_T)
device_Σ_T = cuda.to_device(Σ_T)
device_QT_odd = cuda.to_device(QT)
device_QT_even = cuda.to_device(QT)
device_QT_first = cuda.to_device(QT_first)
device_μ_Q = cuda.to_device(μ_Q)
device_σ_Q = cuda.to_device(σ_Q)
profile = np.empty((k, 3)) # float64
indices = np.empty((k, 3)) # int64
profile[:] = np.inf
indices[:, :] = -1
device_profile = cuda.to_device(profile)
device_indices = cuda.to_device(indices)
_compute_and_update_PI_kernel[blocks_per_grid, threads_per_block](
    range_start - 1,
    device_T_A,
    device_T_B,
    m,
    device_QT_even,
    device_QT_odd,
    device_QT_first,
    device_M_T,
    device_Σ_T,
    device_μ_Q,
    device_σ_Q,
    k,
    ignore_trivial,