Week 3: Probability, correlation, mutual information¶
Load killer whale tag data¶
Reloading the killer whale data set from week 1, along with a few preprocessing steps that may come in handy.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
df = pd.read_csv("datasets/data_oo23_181a.csv", sep=",") # import as data frame
df = df.iloc[::10].reset_index(drop=True) # Keep every 10th row
df[['pitch','roll','head']] = df[['pitch','roll','head']]/math.pi*180 # Convert prh from radians to degrees
data = df.iloc[:, 1:].to_numpy(dtype=float) # Convert to numpy array but skip the time stamps
t = np.arange(data.shape[0])/3600*10 # time in hours since start
Exploratory data analysis¶
Pair plot with correlation coefficients¶
Since I had the pair plot from week 1, I asked ChatGPT the following:
- How do I plot the values of the correlation coefs along with my pair plot made with the seaborn library? So far I have: # Pair Plot sns.pairplot(df[['depth','pitch','roll','head','ax','ay','az']]) #corner='true') plt.show()
I also asked it to update the code to include Spearman's rho so that I could plot both values side by side.
from scipy.stats import spearmanr
import seaborn as sns
# Generate pairplot
g = sns.pairplot(
    df[['depth','pitch','roll','head','ax','ay','az']],
    corner=True,                        # remove upper triangle
    plot_kws=dict(s=12, alpha=0.4)
)
# ---- Function to annotate correlations ----
def corr_coefficient(x, y, **kwargs):
    r = np.corrcoef(x, y)[0,1]          # Pearson's r
    rho, p = spearmanr(x, y)            # Spearman's rho
    ax = plt.gca()
    ax.annotate(
        f"r: {r:.2f} ρ: {rho:.2f}",
        xy=(0.5, 0.5),
        xycoords=ax.transAxes,
        ha="center", va="center",
        fontsize=14, fontweight="bold"
    )
    #ax.set_axis_off()
# ---- Apply function to lower triangle ----
g.map_lower(corr_coefficient)
plt.show()
There are some funky nonlinear relationships here that are poorly captured by the correlation coefficients...
Pearson's r measures linear relationships and Spearman's rho monotonic ones, so the coefficients for Ax-pitch, for example, make intuitive sense. The Ay-roll coefficients required a closer look, probably because most roll values are close to 0 deg.
The depth and Az distributions are highly skewed, as expected, which results in larger differences between r and rho.
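As a quick sanity check on the difference between the two coefficients, here is a minimal synthetic sketch (not the tag data): a noiseless monotonic-but-nonlinear relationship gives rho = 1 exactly while r stays below 1.
import numpy as np
from scipy.stats import spearmanr
# Synthetic monotonic-but-nonlinear relationship (illustration only)
xdemo = np.linspace(0, 5, 200)
ydemo = np.exp(xdemo)                       # strictly increasing, strongly nonlinear
r_demo = np.corrcoef(xdemo, ydemo)[0, 1]    # Pearson: linear association only
rho_demo, _ = spearmanr(xdemo, ydemo)       # Spearman: rank (monotonic) association
print(f"r = {r_demo:.2f}, rho = {rho_demo:.2f}")   # rho = 1.00, r well below 1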
Examples of 2D scatterplots with eigenvectors¶
Let's home in on three acceleration vs orientation relationships, adapting some code from class. Since the same steps repeat for each pair, I looped them rather than copy-pasting three times.
print(df.columns) # column names
pitchax = data[:,[3,6]]
rollay = data[:,[4,7]]
rollaz = data[:,[4,8]]
Index(['t', 'lat', 'lon', 'depth', 'pitch', 'roll', 'head', 'ax', 'ay', 'az'], dtype='object')
# find mean, covariance, and ±1-sd eigenvector segments for a 2-column array
def eig_segments(xy):
    mean = np.mean(xy, axis=0)
    cov = np.cov(xy, rowvar=False)
    evalu, evect = np.linalg.eigh(cov)       # eigh: covariance matrix is symmetric
    segx, segy = [], []
    for k in range(2):
        d = evect[:, k]*np.sqrt(evalu[k])    # eigenvector scaled by its sd
        segx += [mean[0]-d[0], mean[0]+d[0], None]
        segy += [mean[1]-d[1], mean[1]+d[1], None]
    return mean, cov, segx, segy
# plot scatter, mean, and eigenvector segments for each pair
pairs = [(pitchax, 'Pitch (deg)', 'Ax (g)'),
         (rollay, 'Roll (deg)', 'Ay (g)'),
         (rollaz, 'Roll (deg)', 'Az (g)')]
covs = []
plt.figure(figsize=(8, 4))
for i, (xy, xlab, ylab) in enumerate(pairs):
    mean, cov, segx, segy = eig_segments(xy)
    covs.append(cov)
    plt.subplot(1, 3, i+1)
    plt.plot(xy[:,0], xy[:,1], 'o', markersize=1.5, alpha=0.3)
    plt.plot(mean[0], mean[1], 'ro')
    plt.plot(segx, segy, 'r')
    plt.xlabel(xlab)
    plt.ylabel(ylab)
plt.tight_layout()
plt.show()
# print covariance matrices
np.set_printoptions(precision=2, suppress=True)
print("covariance matrices:")
for cov in covs:
    print(cov)
covariance matrices:
[[313.7    4.8 ]
 [  4.8    0.07]]
[[950.06   5.11]
 [  5.11   0.05]]
[[950.06  -0.81]
 [ -0.81   0.11]]
By far the greatest probability density is near pitch = 0 and roll = 0, so I left out the density surfaces.
The covariance terms are a bit harder to interpret than the correlation coefficients because of the differences in scale between the variables.
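Since covariance units depend on each variable's scale, one way to make the off-diagonal terms comparable is to normalise each covariance matrix into a correlation matrix (each cov_ij divided by the product of the standard deviations); a minimal sketch using the covs list collected in the loop above:
# Convert each covariance matrix to a correlation matrix
for cov in covs:
    sd = np.sqrt(np.diag(cov))        # per-variable standard deviations
    print(cov/np.outer(sd, sd))       # element-wise cov_ij/(sd_i*sd_j)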
Examples of entropy¶
I suspect the histogram bin-size issue is going to matter for my variables, so I will compute the Shannon (differential) entropy from a kernel density estimate (KDE), i.e. from a smoothed distribution. My ChatGPT queries:
- How do i calculate entropy from timeseries data without using histogram?
- Would like to use Shannon entropy using kernel density estimation (KDE). Give me example of smooth kde plot with its value.
from sklearn.neighbors import KernelDensity
roll = data[:,4]
ay = data[:,7]
roll_reshaped = roll.reshape(-1, 1) # 2D vector required for sklearn
ay_reshaped = ay.reshape(-1, 1)
# Fit KDE (Gaussian kernel) for roll and make curve
bandwidth = 0.4
kde = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(roll_reshaped)
xvec = np.linspace(roll.min(), roll.max(), 1000).reshape(-1, 1)
dens = np.exp(kde.score_samples(xvec))
# Monte Carlo estimate of differential entropy in base-2 (bits): h = -E[log p]
samples = kde.sample(2000)                   # draw samples from the fitted KDE
log_probs_nats = kde.score_samples(samples)  # log-density at those samples (nats)
entropy_nats = -np.mean(log_probs_nats)
entropy_bits = entropy_nats / np.log(2)      # convert nats to bits
# Generate first two KDE plots
plt.figure(figsize=(8,4))
plt.subplot(1,3,1)
plt.plot(xvec[:,0], dens, lw=2)
plt.title(f"B={bandwidth}, entropy = {entropy_bits:.4f} bits", fontsize=8)
plt.xlabel("Roll (deg)")
plt.ylabel("Density")
plt.grid(True)
plt.subplot(1,3,2) # zoomed version for roll
plt.plot(xvec[:,0], dens, lw=2)
plt.title(f"B={bandwidth}, entropy = {entropy_bits:.4f} bits", fontsize=8)
plt.xlabel("Roll (deg)")
plt.grid(True)
plt.xlim(-60, 60)
# Fit KDE (Gaussian kernel) for Ay and make curve
bandwidth2 = 0.01
kde = KernelDensity(kernel="gaussian", bandwidth=bandwidth2).fit(ay_reshaped)
xvec = np.linspace(ay.min(), ay.max(), 1000).reshape(-1, 1)
dens = np.exp(kde.score_samples(xvec))
# Shannon entropy in base-2 (bits)
samples = kde.sample(2000) # sample from KDE
log_probs_nats = kde.score_samples(samples)
entropy_nats = -np.mean(log_probs_nats)
entropy_bits = entropy_nats / np.log(2) # convert to bits
# Generate KDE plot
plt.subplot(1,3,3)
plt.plot(xvec[:,0], dens, lw=2)
plt.title(f"B={bandwidth2}, entropy = {entropy_bits:.4f} bits", fontsize=8)
plt.xlabel("Ay (g)")
plt.grid(True)
plt.tight_layout()
These are not really the entropy values I expected, since the distribution shapes are similar. I will normalise the variables and compare again.
Sensitivity to the kernel bandwidth seems limited, although very high bandwidths (heavy smoothing) do increase the estimate by ~1-2 bits.
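To put a rough number on that bandwidth sensitivity, here is a quick sweep over bandwidths for raw roll, reusing the same Monte Carlo entropy estimate (exact values will vary a little with the random sample):
from sklearn.neighbors import KernelDensity
# Sweep kernel bandwidth and recompute the KDE entropy estimate for roll
for bw in [0.1, 0.4, 1.0, 5.0, 20.0]:
    kde_bw = KernelDensity(kernel="gaussian", bandwidth=bw).fit(roll_reshaped)
    s = kde_bw.sample(2000, random_state=0)          # Monte Carlo samples from the KDE
    h_bits = -np.mean(kde_bw.score_samples(s))/np.log(2)
    print(f"bandwidth {bw:5.1f}: entropy = {h_bits:.2f} bits")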
# Standardise variables (mean-centred and divided by std)
roll_norm = (roll - roll.mean()) / roll.std()
ay_norm = (ay - ay.mean()) / ay.std()
roll_reshaped = roll_norm.reshape(-1, 1) # 2D vector required for sklearn
ay_reshaped = ay_norm.reshape(-1, 1)
# Fit KDE (Gaussian kernel) for roll and make curve
bandwidth = 0.05
kde = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(roll_reshaped)
xvec = np.linspace(roll_norm.min(), roll_norm.max(), 1000).reshape(-1, 1)
dens = np.exp(kde.score_samples(xvec))
# Shannon entropy in base-2 (bits)
samples = kde.sample(2000) # sample from KDE
log_probs_nats = kde.score_samples(samples)
entropy_nats = -np.mean(log_probs_nats)
entropy_bits = entropy_nats / np.log(2) # convert to bits
# Generate first KDE plot
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.plot(xvec[:,0], dens, lw=2)
plt.title(f"B={bandwidth}, entropy = {entropy_bits:.4f} bits", fontsize=8)
plt.xlabel("Normalised roll")
plt.ylabel("Density")
plt.grid(True)
# Fit KDE (Gaussian kernel) for Ay and make curve
bandwidth2 = 0.05
kde = KernelDensity(kernel="gaussian", bandwidth=bandwidth2).fit(ay_reshaped)
xvec = np.linspace(ay_norm.min(), ay_norm.max(), 1000).reshape(-1, 1)
dens = np.exp(kde.score_samples(xvec))
# Shannon entropy in base-2 (bits)
samples = kde.sample(2000) # sample from KDE
log_probs_nats = kde.score_samples(samples)
entropy_nats = -np.mean(log_probs_nats)
entropy_bits = entropy_nats / np.log(2) # convert to bits
# Generate the second KDE plot
plt.subplot(1,2,2)
plt.plot(xvec[:,0], dens, lw=2)
plt.title(f"B={bandwidth2}, entropy = {entropy_bits:.4f} bits", fontsize=8)
plt.xlabel("Normalised Ay")
plt.grid(True)
plt.tight_layout()
Interesting, much more similar now. I guess the range matters... Looks like the difference is due to the standardisation:
log2_std = np.log2(roll.std())
print('base-2 log of the sd of roll: ', log2_std)
log2_std = np.log2(ay.std())
print('base-2 log of the sd of Ay: ', log2_std)
base-2 log of the sd of roll:  4.945852935149726
base-2 log of the sd of Ay:  -2.11626212998576
OK, I gather that comparing differential entropy between variables measured on different scales is basically meaningless, but mutual information can be interesting.
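This is the scaling property of differential entropy, h(aX) = h(X) + log2|a| in bits, so the roughly 7-bit gap between the raw roll and Ay entropies simply reflects their ~7-bit difference in log2 standard deviation. A minimal synthetic check with a standard-normal sample and a hypothetical scale factor a = 8 (log2(8) = 3 bits); the bandwidth is scaled along with the data so the relative smoothing stays the same:
import numpy as np
from sklearn.neighbors import KernelDensity
# Check h(aX) = h(X) + log2(a) with the same KDE-based estimator
rng = np.random.default_rng(0)
z = rng.standard_normal(5000).reshape(-1, 1)   # X ~ N(0,1); theory: h = 0.5*log2(2*pi*e) = 2.05 bits
a = 8.0                                        # hypothetical scale factor
def kde_entropy_bits(v, bandwidth):
    kde = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(v)
    s = kde.sample(2000, random_state=0)
    return -np.mean(kde.score_samples(s))/np.log(2)
print(kde_entropy_bits(z, 0.2))        # about 2.05 bits
print(kde_entropy_bits(a*z, 0.2*a))    # about 2.05 + 3 bits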
Mutual information¶
Below I combine the histogram-based entropy code from class with the pair plot. The values are sensitive to nbins.
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
nbins = 256 # no. histogram bins
def entropy(dist):
    positives = dist[dist > 0]    # boolean mask works for 1D and 2D arrays; 0 log(0) = 0
    return -np.sum(positives*np.log2(positives))
def information(x, y, **kwargs):
    xhist,xedges = np.histogram(x,nbins)
    xdist = xhist/np.sum(xhist)                    # marginal p(x)
    yhist,yedges = np.histogram(y,nbins)
    ydist = yhist/np.sum(yhist)                    # marginal p(y)
    xyhist,xedges,yedges = np.histogram2d(x,y,[nbins,nbins])
    xydist = xyhist/np.sum(xyhist)                 # joint p(x,y)
    Hx = entropy(xdist)
    Hy = entropy(ydist)
    Hxy = entropy(xydist)
    I = Hx+Hy-Hxy                                  # mutual information in bits
    ax = plt.gca()
    ax.annotate(
        f"{I:.1f} bits",
        xy=(0.5, 0.5),
        xycoords=ax.transAxes,
        ha="center", va="center",
        fontsize=14, fontweight="bold"
    )
# Generate pairplot
g = sns.pairplot(
    df[['depth','pitch','roll','head','ax','ay','az']],
    corner=True,                        # remove upper triangle
    plot_kws=dict(s=12, alpha=0.4)
)
# ---- Apply function to lower triangle ----
g.map_lower(information)
plt.show()
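To see how sensitive these values are to the bin count, here is a quick sweep for one pair (roll vs Ay), reusing the entropy function above:
# Mutual information of roll vs Ay as a function of histogram bin count
for nb in [16, 32, 64, 128, 256]:
    xd = np.histogram(df['roll'], nb)[0]/len(df)
    yd = np.histogram(df['ay'], nb)[0]/len(df)
    xyd = np.histogram2d(df['roll'], df['ay'], [nb, nb])[0]/len(df)
    print(f"nbins = {nb:3d}: I = {entropy(xd)+entropy(yd)-entropy(xyd):.2f} bits")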
Gaussian mixture model¶
Fitting a two-component Gaussian mixture with the E-M code from class, using y = -depth and x = pitch to save time.
import numpy as np
import matplotlib.pyplot as plt
y = -data[:,2]
x = data[:,3]
# Gaussian mixture model parameters
npts = len(x)        # number of data points (8892 here)
nclusters = 2
nsteps = 25
nplot = 100
np.random.seed(0)
# choose starting points and initialize
indices = np.random.uniform(low=0,high=len(x),size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]
varx = (np.max(x)-np.min(x))**2
vary = (np.max(y)-np.min(y))**2
pc = np.ones(nclusters)/nclusters
# do E-M iterations
#
for i in range(nsteps):
    #
    # construct matrices
    #
    xm = np.outer(x,np.ones(nclusters))
    ym = np.outer(y,np.ones(nclusters))
    muxm = np.outer(np.ones(npts),mux)
    muym = np.outer(np.ones(npts),muy)
    varxm = np.outer(np.ones(npts),varx)
    varym = np.outer(np.ones(npts),vary)
    pcm = np.outer(np.ones(npts),pc)
    #
    # use model to update probabilities (E-step)
    #
    pvgc = (1/np.sqrt(2*np.pi*varxm))*\
        np.exp(-(xm-muxm)**2/(2*varxm))*\
        (1/np.sqrt(2*np.pi*varym))*\
        np.exp(-(ym-muym)**2/(2*varym))
    pvc = pvgc*pcm
    pcgv = pvc/np.outer(np.sum(pvc,1),np.ones(nclusters))
    #
    # use probabilities to update model (M-step)
    #
    pc = np.sum(pcgv,0)/npts
    mux = np.sum(xm*pcgv,0)/(npts*pc)
    muy = np.sum(ym*pcgv,0)/(npts*pc)
    varx = 0.1+np.sum((xm-muxm)**2*pcgv,0)/(npts*pc)   # 0.1 floor keeps variances from collapsing
    vary = 0.1+np.sum((ym-muym)**2*pcgv,0)/(npts*pc)
# Plot data with GMM means and sds
fig,ax = plt.subplots()
plt.plot(x,y,'.')
plt.errorbar(mux,muy,xerr=np.sqrt(varx),yerr=np.sqrt(vary),fmt='r.',markersize=20)
plt.autoscale()
plt.xlabel("Pitch (deg)")
plt.ylabel("Depth (m)")
plt.show()
# plot predicted probability surface
xplot = np.linspace(np.min(x),np.max(x),nplot)
yplot = np.linspace(np.min(y),np.max(y),nplot)
(X,Y) = np.meshgrid(xplot,yplot)
p = np.zeros((nplot,nplot))
for c in range(nclusters):
    p += np.exp(-(X-mux[c])**2/(2*varx[c]))/np.sqrt(2*np.pi*varx[c])\
         *np.exp(-(Y-muy[c])**2/(2*vary[c]))/np.sqrt(2*np.pi*vary[c])\
         *pc[c]
fig, ax = plt.subplots(subplot_kw={"projection":"3d"})
#eps = 1e-12
#ax.plot_surface(X, Y, np.log(p + eps))
ax.plot_surface(X, Y, p)
ax.set_ylabel("depth (m)")
ax.set_xlabel("pitch (deg)")
#plt.title("log(probability)")
plt.show()
The killer whale spent most of its time, by far, near the surface and at shallow pitch angles. At deeper depths, below 10 m or so, the variation in pitch was greater, i.e. the whale made steeper ascents and descents.
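As a cross-check on the hand-rolled E-M above, sklearn's GaussianMixture should land on similar components (a sketch; note it fits full covariance matrices rather than the axis-aligned variances used above):
from sklearn.mixture import GaussianMixture
# Fit a two-component GMM to the same (pitch, -depth) data
gm = GaussianMixture(n_components=2, random_state=0).fit(np.column_stack([x, y]))
print("weights:", gm.weights_)
print("means (pitch, -depth):")
print(gm.means_)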