# Introduction to Computational Analysis

 Pay Notebook Creator: Roy Hyunjin Han · Set Container: Numerical CPU with TINY Memory for 10 Minutes
In [ ]:
import numpy as np
from pycuda import autoinit, driver as cuda, gpuarray
from pycuda.compiler import SourceModule
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel

# Build the 256-element float32 test vector 0..255 directly on the host;
# np.arange avoids materializing an intermediate Python list from xrange
vectorHost = np.arange(256, dtype=np.float32)

In [ ]:
device = autoinit.device
print 'GPU Name: {}'.format(device.name())
print 'GPU Memory: {:,}'.format(device.total_memory())
print '\n'.join('{}: {:,}'.format(key, value) for key, value in device.get_attributes().iteritems())


# Define computation

Define computation directly.

In [ ]:
# Copy the host vector into device (GPU) memory
vectorDevice = gpuarray.to_gpu(vectorHost)
# Double every element on the GPU, then transfer the result back to the host
(vectorDevice * 2).get()

In [ ]:
# Dot product computed on the GPU; .get() copies the scalar back to the host
gpuarray.dot(vectorDevice, vectorDevice).get()


Define computation with ElementwiseKernel.

In [ ]:
# Prepare kernel
kernel = ElementwiseKernel(
'float *vector',
'vector[i] *= 2')
# Send vector to device
vectorHost = np.array(xrange(256)).astype(np.float32)
vectorDevice = gpuarray.to_gpu(vectorHost)
# Get result from device
kernel(vectorDevice)
vectorHost = vectorDevice.get()
print vectorHost


Define computation with SourceModule.

In [ ]:
# Prepare kernel
kernel = SourceModule("""
__global__ void doublify(float *vector) {
}
""").get_function('doublify')
# Compute
vectorHost = np.array(xrange(256)).astype(np.float32)
kernel(cuda.InOut(vectorHost), block=(vectorHost.size, 1, 1))
print vectorHost

In [ ]:
# Prepare kernel
kernel = SourceModule("""
__global__ void doublify(float *vector) {{
}}
""".format(scalar=2)).get_function('doublify')
# Compute
vectorHost = np.array(xrange(256)).astype(np.float32)
kernel(cuda.InOut(vectorHost), block=(vectorHost.size, 1, 1))
print vectorHost

In [ ]:
# Prepare kernel
kernel = SourceModule("""
__global__ void doublify(float *vector) {
vector[offset] *= 2;
}
""").get_function('doublify')
# Compute
vectorHost = np.array([[1, 2], [3, 4]]).astype(np.float32)
kernel(cuda.InOut(vectorHost), block=(
vectorHost.shape[0],
vectorHost.shape[1],
1,
))
print vectorHost


Define computation with ReductionKernel.

In [ ]:
# Prepare kernel
kernel = ReductionKernel(
np.float32,                     # Precision for input and output
neutral='0',                    # Starting value for reduction
map_expr='x[i] * y[i]',         # C code defining map()
reduce_expr='a + b',            # C code defining reduce()
arguments='float *x, float *y') # Function arguments
# Send vector to device
vectorHost = np.array(xrange(256)).astype(np.float32)
vectorDevice = gpuarray.to_gpu(vectorHost)
# Get result from device
resultHost = kernel(vectorDevice, vectorDevice).get()
print resultHost


# Time computation

In [ ]:
# Send vector to device
vectorHost = np.array(xrange(256)).astype(np.float32)
vectorDevice = gpuarray.to_gpu(vectorHost)

# Start timer
start = cuda.Event()
end = cuda.Event()
start.record()

# Get result from device
vectorHost = (2 * vectorDevice).get()

# End timer
end.record()
end.synchronize()
print '{:,} seconds'.format(start.time_till(end) * 1e-3)