Timing, profiling and compiling

Timing, profiling and compiling#

Timing#

import numpy as np
import time
rng = np.random.default_rng(3945)

A straightforward way to time code is to read the current time with time.time(). For example, consider the following matrix multiplication:

n = 10**3
A = rng.normal(size = (n,n))

t_start = time.time()
B = A@A
t_end = time.time()

print('Time taken = ', t_end-t_start)
print('GFLOPS = ', 2*n**3 / (t_end-t_start) / 1e9)

Time taken =  0.01887202262878418
GFLOPS =  105.97698187101257

As well as the time taken, this also outputs the number of floating point operations per second (in units of \(10^9\) FLOPS or GFLOPS). I’ve used the fact that for each element of the new matrix we have to do \(n\) multiplications and \(\approx n\) additions, so we expect the total number of operations to be \(2n^3\).

%%timeit -n 3 -r 10
# -n gives the number of times to run for each time measurement
# -r is the number of times to repeat the time measurement
B = A@A

21.9 ms ± 3.36 ms per loop (mean ± std. dev. of 10 runs, 3 loops each)

Question

Does the number of GFLOPS we obtain here make sense? What should it be? How does it depend on \(N\)?

Profiling#

import matplotlib.pyplot as plt
import scipy.integrate

We can also look in more detail at which parts of the code take the most time. This is known as profiling. As an example, let’s take a look at the code from the homework solutions that solves the Ising model.

def run_ising(beta):
    
    N = 30
    Nspins = N*N
    ntrials = 10**5

    # Populate the array with random spins (+1 or -1)
    #spins = -1 + rng.integers(2, size = Nspins) * 2
    spins = np.ones(Nspins)

    M = np.zeros(ntrials)
    M[0] = np.sum(spins)/Nspins  # starting point

    # Compute the random numbers to use for accepting the jump in advance
    u = rng.uniform(size=ntrials)

    for i in range(ntrials-1):
        
        # Choose spin at random
        j = rng.integers(Nspins)
        
        # Compute energy change if this spin were to flip
        delta_E = 2*spins[j] * (spins[j-1] + spins[(j+1) % Nspins] + spins[j-N] + spins[(j+N) % Nspins])
    
        # Attempt flip
        if u[i] <= np.exp(-beta*delta_E):
            spins[j] = -spins[j]
 
        # Store magnetization
        M[i+1] = np.sum(spins)/Nspins
        
    # remove the first 20% (burn in)
    return M[int(ntrials/5):]

def run_ising_optimized(beta):
    
    N = 30
    Nspins = N*N
    ntrials = 10**5

    # Populate the array with random spins (+1 or -1)
    #spins = -1 + rng.integers(2, size = Nspins) * 2
    spins = np.ones(Nspins)

    M = np.zeros(ntrials)
    M[0] = np.sum(spins)/Nspins  # starting point

    # Compute the random numbers to use for accepting the jump in advance
    u = rng.uniform(size=ntrials)
    jvals = rng.integers(Nspins, size=ntrials)
    
    for i in range(ntrials-1):
        
        # Choose spin at random
        #j = rng.integers(Nspins)
        j = jvals[i]
        
        # Compute energy change if this spin were to flip
        delta_E = 2*spins[j] * (spins[j-1] + spins[(j+1) % Nspins] + spins[j-N] + spins[(j+N) % Nspins])
    
        # Attempt flip
        if u[i] <= np.exp(-beta*delta_E):
            spins[j] = -spins[j]
            M[i+1] = M[i] + 2*spins[j]/Nspins
        else:
            M[i+1] = M[i]
        
        # Store magnetization
        #M[i+1] = np.sum(spins)/Nspins
        
    # remove the first 20% (burn in)
    return M[int(ntrials/5):]

%%prun -q
# remove the -q above to generate output
# in class we used the profiling output to optimize--
# compare run_ising with run_ising_optimized 
# with a factor of 3 speed up

T_vals = np.arange(1.0,4.05,0.1)

M_vals = np.zeros_like(T_vals)
chi_vals = np.zeros_like(T_vals)

for i, T in enumerate(T_vals):
    M = run_ising(1/T)
    M_vals[i] = np.mean(np.abs(M))
    chi_vals[i] = np.var(M)

    if (abs(np.floor(T*2)-(T*2))<1e-5):
        print("T = ", T)
        t = np.arange(len(M))
        plt.plot(t, M, label = "T=%g" % (T,))
        
plt.ylim((-1.05,1.05))
plt.legend()
plt.show()
        
plt.clf()
plt.errorbar(T_vals, M_vals, np.sqrt(chi_vals), fmt='.')
plt.ylabel(r'$M$')
plt.xlabel(r'$T$')
plt.show()

plt.clf()
plt.plot(T_vals, chi_vals)
plt.ylabel(r'$\sigma_M^2$')
plt.xlabel(r'$T$')
plt.show()

T =  1.0

T =  1.5000000000000004

T =  2.000000000000001

T =  2.5000000000000013

T =  3.0000000000000018

T =  3.500000000000002

T =  4.000000000000003

_images/9228d0c434d77ef892ec7460e1af9e9d523caa8243088a6d5e833df9a873c38f.png

_images/d5562212eafade86afbee9bd6fdefb5b02ce8b69234dab76f0ec9a76cb78692e.png

_images/28f78ae80515c81e2c28abb4c4016b4e5dfcb0a70212a186550fbbcec2db98ed.png

Keeping track of memory#

As an example, here is the code we used for 2D diffusion. We can use tracemalloc to look at the memory usage.

# MBs needed for 128x128 array of doubles
128*128*8/1e6

0.131072

import tracemalloc

tracemalloc.start()


def construct_matrices(n, alpha):
    # This is the matrix for the explicit update on the RHS
    AR = np.diag((1-alpha)*np.ones(n)) + np.diag(alpha/2*np.ones(n-1),k=1) + np.diag(alpha/2*np.ones(n-1),k=-1)

    # and for the implicit update on the LHS
    b = 1 + alpha*np.ones(n)
    a = -alpha/2*np.ones(n)
    a[0] = 0
    c = -alpha/2*np.ones(n)
    c[-1] = 0
    AL = np.row_stack((a,b,c))

    return AL, AR

def gaussian(x, y, x0, t):
    # Green's function for the diffusion equation
    dx = x-x0[0]
    dy = y-x0[1]
    return np.exp(-(dx**2 + dy**2)/(4*t)) / (4*np.pi*t)

# set up the grid and initial condition
n = 128
T = np.zeros((n,n))
x = np.linspace(0,1,n)
dx = x[1]-x[0]
xx, yy = np.meshgrid(x, x)
x0 = (0.5, 0.5)   # location of the gaussian
t0 = 0.001   # initial time
T = gaussian(xx, yy, x0, t0)

# timesteps
alpha = 2
nsteps = 80
dt = alpha * dx**2
AL, AR = construct_matrices(n, alpha)

%matplotlib
image = plt.imshow(T)
plt.title("%d: t=%lg" % (0,t0))
plt.colorbar()
plt.show()

Tnew = np.zeros_like(T)
B = np.zeros_like(T)

# We take twice as many steps here because we are doing half-timesteps
for i in range(2*nsteps):

    # Compute the right-hand side matrix row by row
    for j in range(n):
        B[j,:] = AR @ T[j,:]

    # and now solve column by column for the new T
    for j in range(n):
        Tnew[:,j] = scipy.linalg.solve_banded((1,1), AL, B[:,j])

    # take the transpose to swap directions for the next half-step
    T = Tnew.T
    
    if i%2 == 1:
        ti = t0 + dt*(1 + i//2)
        #plt.clf()
        #plt.imshow(T)
        #plt.title("%d: t=%lg" % (1 + i//2,ti))
        #plt.colorbar()
        #plt.show()
        #plt.pause(1e-3)

        # in class we changed the plotting to the following:        
        image.set_data(T)
        plt.draw()
        plt.pause(1e-3)
        # rather than make a new plot each step, we modify the plot data

    if i%10 == 0:
        current, peak = tracemalloc.get_traced_memory()
        print("Step ", i, ": current/peak memory usage is", current / 10**6, " ", peak/10**6, "MB")

# compare the result with the analytic one
ti = t0 + dt*(1 + i//2)
delta = (T - gaussian(xx, yy, x0, ti)) / np.max(T)
print("T range = ", np.min(T), np.max(T))
print("delta range = ", np.min(delta), np.max(delta))

%matplotlib inline
plt.clf()
fig = plt.figure(figsize=(8,4))
plt.subplot(121)
plt.imshow(T)
plt.title(r'$T$')
plt.colorbar(shrink=0.7)
plt.subplot(122)
plt.imshow(delta)
plt.title(r'$\Delta T/T$')
plt.colorbar(shrink=0.7)
plt.tight_layout()
plt.show()

tracemalloc.stop()

Using matplotlib backend: <object object at 0x1057ab3b0>
Step  0 : current/peak memory usage is 1.571314   1.746036 MB

Step  10 : current/peak memory usage is 2.171768   24.627224 MB

Step  20 : current/peak memory usage is 2.211372   24.670008 MB

Step  30 : current/peak memory usage is 2.240212   24.701018 MB

Step  40 : current/peak memory usage is 2.274824   24.731646 MB

Step  50 : current/peak memory usage is 2.315045   24.781835 MB

Step  60 : current/peak memory usage is 2.325933   24.791328 MB

Step  70 : current/peak memory usage is 2.337578   24.801968 MB

Step  80 : current/peak memory usage is 2.349743   24.813838 MB

Step  90 : current/peak memory usage is 2.342981   24.822751 MB

Step  100 : current/peak memory usage is 2.355003   24.822751 MB

Step  110 : current/peak memory usage is 2.365809   24.829827 MB

Step  120 : current/peak memory usage is 2.375148   24.840051 MB

Step  130 : current/peak memory usage is 2.385793   24.850638 MB

Step  140 : current/peak memory usage is 2.378651   24.850638 MB

Step  150 : current/peak memory usage is 2.389469   24.854347 MB

T range =  7.374417235632719e-06 7.286560064543426
delta range =  -0.0022616334982004526 0.0006078416564473385

<Figure size 1280x960 with 0 Axes>

_images/573fcc9ef9886554cafd06281a566e648412ecfcdc84aa6801516d9245dde629.png

Compiling python#

Python can be slow because it is an interpreted rather than compiled language. As an example, let’s write our own implementation of matrix multiplication:

def matrix_multiply(A):
    n = np.shape(A)[0]
    B = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            for k in range(n):
                B[i,j] += A[i,k] * A[k,j]
    return B

n = 200
A = rng.normal(size = (n,n))

t_start = time.time()
B = matrix_multiply(A)
t_end = time.time()

print('Time taken = ', t_end-t_start)
print('GFLOPS = ', 2*n**3 / (t_end-t_start) / 1e9)

Time taken =  2.569782018661499
GFLOPS =  0.006226209026216856

One way to pre-compile python code is to use Cython. Let’s give it a try:

%load_ext Cython

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[11], line 1
----> 1 get_ipython().run_line_magic('load_ext', 'Cython')

File /opt/homebrew/Caskroom/miniconda/base/lib/python3.12/site-packages/IPython/core/interactiveshell.py:2480, in InteractiveShell.run_line_magic(self, magic_name, line, _stack_depth)
   2478     kwargs['local_ns'] = self.get_local_scope(stack_depth)
   2479 with self.builtin_trap:
-> 2480     result = fn(*args, **kwargs)
   2482 # The code below prevents the output from being displayed
   2483 # when using magics with decorator @output_can_be_silenced
   2484 # when the last Python token in the expression is a ';'.
   2485 if getattr(fn, magic.MAGIC_OUTPUT_CAN_BE_SILENCED, False):

File /opt/homebrew/Caskroom/miniconda/base/lib/python3.12/site-packages/IPython/core/magics/extension.py:33, in ExtensionMagics.load_ext(self, module_str)
     31 if not module_str:
     32     raise UsageError('Missing module name.')
---> 33 res = self.shell.extension_manager.load_extension(module_str)
     35 if res == 'already loaded':
     36     print("The %s extension is already loaded. To reload it, use:" % module_str)

File /opt/homebrew/Caskroom/miniconda/base/lib/python3.12/site-packages/IPython/core/extensions.py:62, in ExtensionManager.load_extension(self, module_str)
     55 """Load an IPython extension by its module name.
     56 
     57 Returns the string "already loaded" if the extension is already loaded,
     58 "no load function" if the module doesn't have a load_ipython_extension
     59 function, or None if it succeeded.
     60 """
     61 try:
---> 62     return self._load_extension(module_str)
     63 except ModuleNotFoundError:
     64     if module_str in BUILTINS_EXTS:

File /opt/homebrew/Caskroom/miniconda/base/lib/python3.12/site-packages/IPython/core/extensions.py:77, in ExtensionManager._load_extension(self, module_str)
     75 with self.shell.builtin_trap:
     76     if module_str not in sys.modules:
---> 77         mod = import_module(module_str)
     78     mod = sys.modules[module_str]
     79     if self._call_load_ipython_extension(mod):

File /opt/homebrew/Caskroom/miniconda/base/lib/python3.12/importlib/__init__.py:90, in import_module(name, package)
     88             break
     89         level += 1
---> 90 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1387, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1360, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1324, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'Cython'

%%cython
# Add the flag "-a" to the line above to turn on the output

# in class we went through the modifications below
# - using cdef to declare variable types
# - using the decorators to turn off bounds checking etc.
# Basically try to turn the yellow lines in the output to white and you 
# should get a significant speed up

import numpy as np

cimport cython
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
@cython.initializedcheck(False)

def matrix_multiply_cython(double[:,:] A, double[:,:] B):
    cdef int i,j,k
    cdef int n = A.shape[0]
    for i in range(n):
        for j in range(n):
            for k in range(n):
                B[i,j] += A[i,k] * A[k,j]
    return B

n = 10**3
A = rng.normal(size = (n,n))
B = np.zeros_like(A)

t_start = time.time()
matrix_multiply_cython(A,B)
t_end = time.time()

print('Time taken = ', t_end-t_start)
print('GFLOPS = ', 2*n**3 / (t_end-t_start) / 1e9)

Time taken =  1.2505810260772705
GFLOPS =  1.5992566321539767

Just in time (JIT) compilation using Numba:

import numba as nb

@nb.njit(parallel = True)
def matrix_multiply(A):
    n = np.shape(A)[0]
    B = np.zeros((n,n))
    # note the use of prange in the next line for parallel loops
    for i in nb.prange(n):
        for k in range(n):
            for j in range(n):
                B[i,j] += A[i,k] * A[k,j]
    return B

n = 10**3
A = rng.normal(size = (n,n))

t_start = time.time()
B = matrix_multiply(A)
t_end = time.time()

print('Time taken = ', t_end-t_start)
print('GFLOPS = ', 2*n**3 / (t_end-t_start) / 1e9)

t_start = time.time()
B = matrix_multiply(A)
t_end = time.time()

print('Time taken = ', t_end-t_start)
print('GFLOPS = ', 2*n**3 / (t_end-t_start) / 1e9)

Time taken =  0.6430690288543701
GFLOPS =  3.110086025388297
Time taken =  0.03811812400817871
GFLOPS =  52.46847928746114

Setting parallel = True will use multiple cores with a significant speed up. It is also possible to use parallelism in Cython with OpenMP.

Other examples#

# Illustration of memory ordering
n = 10**4
A = rng.uniform(size = (n,n))
x = rng.uniform(size = n)

t_start = time.time()
for i in range(n):
    b = x * A[i,:]
t_end = time.time()

print('Time taken = ', t_end-t_start)

t_start = time.time()
for i in range(n):
    b = x * A[:,i]
t_end = time.time()

print('Time taken = ', t_end-t_start)

Time taken =  0.12569189071655273
Time taken =  0.24962282180786133

# Using built-in parallelization in scipy fft instead of numpy fft
n = 10**4
A = rng.uniform(size = (n,n))

t_start = time.time()
B = np.fft.fft(A)
t_end = time.time()
print('Time taken = ', t_end-t_start)

t_start = time.time()
B = scipy.fft.fft(A)
t_end = time.time()
print('Time taken = ', t_end-t_start)

t_start = time.time()
B = scipy.fft.fft(A, workers=8)
t_end = time.time()
print('Time taken = ', t_end-t_start)

Time taken =  1.6034882068634033
Time taken =  1.53358793258667
Time taken =  0.5826659202575684

# Demonstration of memory leak because we took a slice:
from  psutil import Process
import numpy as np

print("memory before=",Process().memory_info().rss)
arr = np.arange(0, 100_000_000)
print("memory after declaring arr=", Process().memory_info().rss)
# without the np.copy on the next line we will get a memory leak!!
small_slice = np.copy(arr[:10])
# try instead: 
#small_slice = arr[:10]
del arr
print("memory after deleting arr=",Process().memory_info().rss)

memory before= 943849472
memory after declaring arr= 544079872
memory after deleting arr= 138919936