< Home
5. Probability¶
Assignment¶
- Investigate the probability distribution of your data
- Set up template notebooks and slides for your data set analysis
Frequentist <-> Bayesian
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# === 1. Load dataset ===
df = pd.read_csv('./datasets/Wine_dataset.csv') # Modify file name if different
# === 2. Select only numeric columns (including class column if numeric) ===
numeric_cols = df.select_dtypes(include=[np.number]).columns
# === 3. Calculate layout for subplots automatically ===
num_cols = len(numeric_cols)
cols = 3 # Number of plots per row
rows = int(np.ceil(num_cols / cols))
plt.figure(figsize=(12, rows * 3)) # Figure size for entire grid
# === 4. Draw histogram for each numeric column ===
for i, col in enumerate(numeric_cols, start=1):
x = df[col].dropna()
plt.subplot(rows, cols, i)
plt.hist(x, bins=50, density=True, alpha=0.7)
# Plot spike marks showing raw data positions
plt.plot(x, 0*x, '|', ms=15)
# Calculate and plot Gaussian distribution curve
mean = x.mean()
std = x.std()
xi = np.linspace(mean - 3*std, mean + 3*std, 100)
yi = np.exp(-(xi-mean)**2 / (2*std**2)) / np.sqrt(2*np.pi*std**2)
plt.plot(xi, yi, 'r')
plt.title(col)
# Adjust layout and save output image
plt.tight_layout()
#plt.savefig("./hist_all_columns.png", dpi=150)
plt.show()
Averaging¶
I tried creating an averaging graph with "Alocohol" colum from my dataset referring to Neil's sample code.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('./datasets/Wine_dataset.csv')
data = df["Alcohol"].dropna().values
#mean = 1
#width = 2
trials = 100
points = np.arange(10,500,25)
means = np.zeros((trials,len(points)))
#
# loop over number of points
#
for point in range(len(points)):
#
# loop over trials
#
for trial in range(trials):
sample = np.random.choice(data, points[point], replace=True)
means[trial, point] = np.mean(sample)
#
# plot calculated width/sqrt(N)
#
true_mean = np.mean(data)
true_std = np.std(data)
plt.plot(points,true_mean + true_std/np.sqrt(points),'r',label='calculated')
plt.plot(points,true_mean + true_std/np.sqrt(points),'r')
#
# plot estimated means and stddevs
#
mean_est = np.mean(means,0)
stddev_est = np.std(means,0)
plt.errorbar(points,mean_est,yerr=stddev_est,fmt='k-o',capsize=7,label='estimated')
#
# plot points
#
for point in range(len(points)):
plt.plot(np.full(trials,points[point]),means[:,point],'o',markersize=2)
plt.xlabel('number of samples averaged')
plt.ylabel('mean estimates')
plt.legend()
plt.show()
Then I made each columns averaging graph with help of chatGPT asked "to make code with all columns and 3 graphs are displayed in one line"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# === Load dataset ===
df = pd.read_csv('./datasets/Wine_dataset.csv')
# === Select numeric columns ===
numeric_cols = df.select_dtypes(include=[np.number]).columns
# === Parameters for experiment ===
trials = 100
points = np.arange(10, 200, 10)
# === Layout settings ===
cols_per_row = 3
total = len(numeric_cols)
rows = (total + cols_per_row - 1) // cols_per_row # number of rows needed
fig, axes = plt.subplots(rows, cols_per_row, figsize=(10, 3*rows))
axes = axes.flatten()
for idx, col in enumerate(numeric_cols):
data = df[col].dropna().values
true_mean = np.mean(data)
means = np.zeros((trials, len(points)))
for p_i, N in enumerate(points):
for t in range(trials):
sample = np.random.choice(data, N, replace=True)
means[t, p_i] = np.mean(sample)
est_mean = np.mean(means, axis=0)
est_std = np.std(means, axis=0)
sigma = np.std(data)
upper = true_mean + sigma / np.sqrt(points)
lower = true_mean - sigma / np.sqrt(points)
# ====== change display style ======
ax = axes[idx]
ax.plot(points, upper, 'r', label='mean ± sigma/sqrt(N)')
ax.plot(points, lower, 'r')
ax.errorbar(points, est_mean, yerr=est_std,
fmt='k-o', capsize=5, label='estimated means')
for i in range(len(points)):
ax.plot(np.full(trials, points[i]), means[:, i],
'o', markersize=3, alpha=0.5)
# Display title
ax.set_title(f"{col}", fontsize=9)
# delete
for i in range(total, len(axes)):
fig.delaxes(axes[i])
plt.tight_layout()
plt.show()
Correlation Analysis¶
I got the advice of seeing the relation of each factor from the one of the classmate Tsuchiya-san
from scipy.stats import spearmanr
import numpy as np
import seaborn as sns
cols = df.columns
cormatrix = pd.DataFrame(index=cols,columns=cols,dtype=float)
p_val_matrix = pd.DataFrame(index=cols,columns=cols,dtype=float)
for colx in cols:
for coly in cols:
if colx == coly:
cormatrix.loc[colx,coly] = 1.0
p_val_matrix.loc[colx,coly] = 1.0
elif(pd.isna(cormatrix.loc[colx,coly])):
corr_coef,p_value = spearmanr(df[colx],df[coly])
cormatrix.loc[colx,coly] = corr_coef
cormatrix.loc[coly,colx] = corr_coef
p_val_matrix.loc[colx,coly] = p_value
p_val_matrix.loc[coly,colx] = p_value
plt.figure(figsize=(10,8))
sns.heatmap(p_val_matrix,annot=True,annot_kws={"size": 6},cmap="binary_r",vmin=0.0,vmax=0.05)
plt.title("p-value of spearman correlation in all variables")
plt.tight_layout()
#plt.savefig('./notupload/sp-pval.png')
plt.show()
plt.figure(figsize=(10,8))
sns.heatmap(cormatrix,annot=True,annot_kws={"size": 6}, cmap="coolwarm",vmin=-1.0,vmax=1.0)
plt.title("co-efficience of spearman correlation in all variables")
plt.tight_layout()
#plt.savefig('./notupload/spearman.png')
plt.show()
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('./datasets/Wine_dataset.csv')
varsamples = df[["Alcohol", "Color intensity"]].values
npts = varsamples.shape[0]
np.random.seed(10)
np.set_printoptions(precision=2)
np.set_printoptions(suppress=True)
#
# find mean and variance
#
varmean = np.mean(varsamples,axis=0)
varstd = np.sqrt(np.var(varsamples,axis=0))
varplotx = [varmean[0]-varstd[0],varmean[0]+varstd[0],None,varmean[0],varmean[0]]
varploty = [varmean[1],varmean[1],None,varmean[1]-varstd[1],varmean[1]+varstd[1]]
#
# generate covariance samples
#
#mean = [7,5]
#covariance = [[2.5,-2.1],[-2.1,2.5]]
#covarsamples = np.random.multivariate_normal(mean,covariance,npts)
#
# find mean, covariance, eigenvalues, and eigenvectors
#
covarmean = np.mean(covarsamples,axis=0)
covar = np.cov(covarsamples,rowvar=False)
evalu,evect = np.linalg.eig(covar)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[1,0]*np.sqrt(evalu[1])
dy0 = evect[0,1]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])
covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]
#
# plot and print
#
print("covariance matrix:")
print(covar)
samples = np.vstack((varsamples,covarsamples))
plt.figure()
plt.hist2d(samples[:,0],samples[:,1],bins=30,cmap='binary')
plt.plot(samples[:,0],samples[:,1],'o',markersize=1.5,alpha=0.3)
plt.plot(varmean[0],varmean[1],'ro')
plt.plot(covarmean[0],covarmean[1],'ro')
plt.plot(varplotx,varploty,'r')
plt.plot(covarplotx,covarploty,'r')
plt.text(2.5,-4,"variance",fontsize=15)
plt.text(-1.25,8.5,"covariance",fontsize=15)
plt.axis('off')
plt.show()
#
# print covariance matrices
#
print("covariance matrix:")
varcovar = np.cov(varsamples,rowvar=False)
print(varcovar)
covariance matrix: [[ 2.41 -2.1 ] [-2.1 2.5 ]]
covariance matrix: [[0.66 1.03] [1.03 5.37]]
import numpy as np
import matplotlib.pyplot as plt
cols = df.columns
check_conb = []
for colx in cols:
for coly in cols:
if colx == coly:
continue
p = p_val_matrix.at[colx, coly]
if p < 0.05:
coeff = cormatrix.at[colx, coly]
check_conb.append([colx, coly, p, coeff])
# 重複削除(colx,coly と coly,colx は1つで良い)
for r in check_conb:
if r[0] == coly and r[1] == colx:
check_conb.remove(r)
# ====== ここから subplot でまとめて描画 ======
n = len(check_conb)
cols_plot = 5 # 1行に3つ
rows_plot = int(np.ceil(n / cols_plot))
plt.figure(figsize=(cols_plot * 8, rows_plot * 8))
for i, r in enumerate(check_conb, start=1):
a = df[r[0]].to_numpy()
b = df[r[1]].to_numpy()
samples = np.column_stack((a, b))
# ----- Variance -----
varmean = np.mean(samples, axis=0)
varstd = np.sqrt(np.var(samples, axis=0))
varplotx = [varmean[0]-varstd[0], varmean[0]+varstd[0], None, varmean[0], varmean[0]]
varploty = [varmean[1], varmean[1], None, varmean[1]-varstd[1], varmean[1]+varstd[1]]
# ----- Covariance -----
covarmean = np.mean(samples, axis=0)
covar = np.cov(samples, rowvar=False)
evalu, evect = np.linalg.eig(covar)
dx0 = evect[0, 0] * np.sqrt(evalu[0])
dx1 = evect[1, 0] * np.sqrt(evalu[1])
dy0 = evect[0, 1] * np.sqrt(evalu[0])
dy1 = evect[1, 1] * np.sqrt(evalu[1])
covarplotx = [covarmean[0]-dx0, covarmean[0]+dx0, None,
covarmean[0]-dx1, covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0, covarmean[1]-dy0, None,
covarmean[1]+dy1, covarmean[1]-dy1]
# ====== subplot 描画 ======
plt.subplot(rows_plot, cols_plot, i)
plt.hist2d(samples[:, 0], samples[:, 1], bins=30, cmap='binary')
plt.plot(samples[:, 0], samples[:, 1], 'o', markersize=1.5, alpha=0.3)
plt.plot(varmean[0], varmean[1], 'go')
plt.plot(covarmean[0], covarmean[1], 'ro')
plt.plot(varplotx, varploty, 'g', lw=3)
plt.plot(covarplotx, covarploty, 'r', lw=3)
title = f"{r[0]} vs {r[1]}\np={r[2]:.4f}, corr={r[3]:.4f}"
plt.title(title, fontsize=14)
plt.axis('off')
plt.tight_layout()
plt.savefig('./datas/covariance_all_in_one.png')
#plt.show()
Information¶
Entropy¶
I check this code with my dataset using 'Alcohol' factor. I asked chatGPT the part where I should change.
import numpy as np
import matplotlib.pyplot as plt
nbins = 256
#xmin = -4
#xmax = 4
# change x to winedataset
x = df['Alcohol'].to_numpy()
print(f"{nbins} bins = {np.log2(nbins):.0f} bits")
#
def entropy(dist):
positives = dist[np.where(dist > 0)] # 0 log(0) = 0
return -np.sum(positives*np.log2(positives))
#
hist, edges = np.histogram(x, nbins)
wine_dist = hist / np.sum(hist)
#fig,axs = plt.subplots(3,1)
#fig.canvas.header_visible = False
#width = 1.1*(xmax-xmin)/nbins
#axs[0].bar(x,uniform)
#axs[0].set_title(f"uniform entropy: {entropy(uniform):.1f} bits")
#axs[1].bar(x,normal,width=width)
#axs[1].set_title(f"Gaussian entropy: {entropy(normal):.1f} bits")
#axs[2].bar(x,onehot,width=width)
#axs[2].set_title(f"one-hot entropy: {entropy(onehot):.1f} bits")
# --- 描画 ---
fig, ax = plt.subplots(1,1, figsize=(6,4))
ax.bar(edges[:-1], wine_dist, width=edges[1]-edges[0])
ax.set_title(f"Wine Alcohol entropy: {entropy(wine_dist):.2f} bits")
plt.tight_layout()
256 bins = 8 bits
I analysed the correlation between each item using entropy.
nbins = 256
# 1 dimension entropy
def entropy(dist):
index = np.where(dist > 0) # o log(0) = 0
positives = dist[index]
return -np.sum(positives*np.log2(positives))
# 2 dimension entropy
def entropy2(dist):
indexx,indexy = np.where(dist > 0) # 0 log(0) = 0
positives = dist[indexx,indexy]
return -np.sum(positives*np.log2(positives))
# information
def information(x,y):
xhist,xedges = np.histogram(x,nbins)
xdist = xhist/np.sum(xhist)
yhist,yedges = np.histogram(y,nbins)
ydist = yhist/np.sum(yhist)
xyhist,xedges,yedges = np.histogram2d(x,y,[nbins,nbins])
xydist = xyhist/np.sum(xyhist)
Hx = entropy(xdist)
Hy = entropy(ydist)
Hxy = entropy2(xydist)
return Hx+Hy-Hxy
position = 1
np.set_printoptions(precision=4)
results = []
#plt.figure(figsize=(30,120))
#
# add here from chatGPT
#
from itertools import combinations
from scipy.stats import pearsonr
check_conb = []
for c1, c2 in combinations(df.columns, 2):
x = df[c1].to_numpy()
y = df[c2].to_numpy()
coeff, pval = pearsonr(x, y)
check_conb.append((c1, c2, pval, coeff))
for r in check_conb:
x = df[r[0]].to_numpy()
y = df[r[1]].to_numpy()
samples = np.column_stack((x,y))
covar = np.cov(samples,rowvar=False)
#print(f"{npts:.0e} points")
#print(f"covariance:\n{covar}")
#plt.subplot(18,3,position)
I = information(x,y)
#plt.scatter(x,y)
#plt.title(f"{r[0]},{r[1]} mutual information: {I:.1f} bits, coeff: {r[3]:.3f}",fontsize=14)
results.append([r[0],r[1],r[2],r[3],I,covar])
#plt.tight_layout()
#plt.savefig('./notupload/mi.png')
#plt.show()
pd.set_option('display.float_format', '{:.8f}'.format)
im_df = pd.DataFrame(results)
im_df.columns = ['val1','val2','p-val','coeff','IM','covar']
im_df = im_df.sort_values('IM',ascending=False)
im_df.to_csv('./imrank-sk.csv')
im_df.head(15)
position = 1
#plt.figure(figsize=(30,120))
nplots = len(im_df) # ← 追加
rows = (nplots + 2) // 3 # ← 追加(3列固定)
plt.figure(figsize=(30, 6*rows)) # ← 高さも自動が望ましい
for i,row in im_df.iterrows():
val1 = row['val1']
val2 = row['val2']
im = row['IM']
coeff = row['coeff']
covar = row['covar']
x = df[val1].to_numpy()
y = df[val2].to_numpy()
#plt.subplot(18,3,position)
plt.subplot(rows, 3, position)
plt.scatter(x,y)
plt.title(f"{val1},{val2},MI:{im:.1f} bits, coeff:{coeff:.3f}, covar:{covar}")
position = position + 1
plt.tight_layout()
plt.savefig('./images/mi-sk.png')
plt.show()
Mutal information¶
As I want to check this code with my dataset, 3 graphes are shown with this sample code.
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(10)
np.set_printoptions(precision=1)
np.set_printoptions(suppress=True)
npts = int(1e5)
nbins = 256
print(f"{nbins} bins\n")
#
def entropy(dist):
index = np.where(dist > 0) # 0 log(0) = 0
positives = dist[index]
return -np.sum(positives*np.log2(positives))
def entropy2(dist):
indexx,indexy = np.where(dist > 0) # 0 log(0) = 0
positives = dist[indexx,indexy]
return -np.sum(positives*np.log2(positives))
def information(x,y):
xhist,xedges = np.histogram(x,nbins)
xdist = xhist/np.sum(xhist)
yhist,yedges = np.histogram(y,nbins)
ydist = yhist/np.sum(yhist)
xyhist,xedges,yedges = np.histogram2d(x,y,[nbins,nbins])
xydist = xyhist/np.sum(xyhist)
Hx = entropy(xdist)
Hy = entropy(ydist)
Hxy = entropy2(xydist)
return Hx+Hy-Hxy
#
# asked chatGPT to draw the graph using winedataset
#
# ==========================================
# 4種類の関係を Wine データから探して描く
# ==========================================
import matplotlib.pyplot as plt
# ---------- ① ランダム(相関もMIも小) ----------
rand_pair = im_df[
(im_df['coeff'].abs() < 0.1) &
(im_df['IM'] < im_df['IM'].quantile(0.2))
].iloc[0]
# ---------- ③ 非線形(相関は小、MIは大) ----------
nonlinear_pair = im_df[
(im_df['coeff'].abs() < 0.2)
].sort_values('IM', ascending=False).iloc[0]
# ---------- ④ 直線(相関もMIも大) ----------
linear_pair = im_df[
im_df['coeff'].abs() > 0.7
].sort_values('IM', ascending=False).iloc[0]
pairs = [
("Random", rand_pair),
("Nonlinear", nonlinear_pair),
("Linear", linear_pair),
]
# ---------- 描画 ----------
plt.figure(figsize=(15,4))
for i, (label, row) in enumerate(pairs, 1):
x = df[row['val1']]
y = df[row['val2']]
plt.subplot(1, 3, i)
plt.scatter(x, y)
plt.xlabel(row['val1'])
plt.ylabel(row['val2'])
plt.title(
f"{label}\n"
f"coeff={row['coeff']:.2f}, MI={row['IM']:.2f}"
)
plt.tight_layout()
plt.show()
256 bins
I found that 'Alcalinity of ash' and 'Magnesium'(left), and 'Flavonoids' and 'OD280/OD315 of diluted wines'(right) seem tohave correlation, but center one has a lot of random data points.