[Maki TANAKA] - Fab Futures - Data Science
Home About

< Home

5. Probability¶

Assignment¶

  • Investigate the probability distribution of your data
  • Set up template notebooks and slides for your data set analysis

Frequentist <-> Bayesian

Histogram¶

Density Estimation¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# === 1. Load dataset ===
df = pd.read_csv('./datasets/Wine_dataset.csv')   # Modify file name if different
In [3]:
# === 2. Select only numeric columns (including class column if numeric) ===
numeric_cols = df.select_dtypes(include=[np.number]).columns

# === 3. Calculate layout for subplots automatically ===
num_cols = len(numeric_cols)
cols = 3                                   # Number of plots per row
rows = int(np.ceil(num_cols / cols))

plt.figure(figsize=(12, rows * 3))          # Figure size for entire grid

# === 4. Draw histogram for each numeric column ===
for i, col in enumerate(numeric_cols, start=1):
    x = df[col].dropna()

    plt.subplot(rows, cols, i)
    plt.hist(x, bins=50, density=True, alpha=0.7)

    # Plot spike marks showing raw data positions
    plt.plot(x, 0*x, '|', ms=15)

    # Calculate and plot Gaussian distribution curve
    mean = x.mean()
    std = x.std()

    xi = np.linspace(mean - 3*std, mean + 3*std, 100)
    yi = np.exp(-(xi-mean)**2 / (2*std**2)) / np.sqrt(2*np.pi*std**2)
    plt.plot(xi, yi, 'r')

    plt.title(col)

# Adjust layout and save output image
plt.tight_layout()
#plt.savefig("./hist_all_columns.png", dpi=150)
plt.show()
No description has been provided for this image

Averaging¶

I tried creating an averaging graph with "Alocohol" colum from my dataset referring to Neil's sample code.

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('./datasets/Wine_dataset.csv')
data = df["Alcohol"].dropna().values

#mean = 1
#width = 2
trials = 100
points = np.arange(10,500,25)
means = np.zeros((trials,len(points)))
#
# loop over number of points
#
for point in range(len(points)):
    #
    # loop over trials
    #
    for trial in range(trials):
        sample = np.random.choice(data, points[point], replace=True)
        means[trial, point] = np.mean(sample)
        
#
# plot calculated width/sqrt(N) 
#
true_mean = np.mean(data)
true_std = np.std(data)

plt.plot(points,true_mean + true_std/np.sqrt(points),'r',label='calculated')
plt.plot(points,true_mean + true_std/np.sqrt(points),'r')
#
# plot estimated means and stddevs
#
mean_est = np.mean(means,0)
stddev_est = np.std(means,0)
plt.errorbar(points,mean_est,yerr=stddev_est,fmt='k-o',capsize=7,label='estimated')
#
# plot points
#
for point in range(len(points)):
    plt.plot(np.full(trials,points[point]),means[:,point],'o',markersize=2)
plt.xlabel('number of samples averaged')
plt.ylabel('mean estimates')
plt.legend()
plt.show()
No description has been provided for this image

Then I made each columns averaging graph with help of chatGPT asked "to make code with all columns and 3 graphs are displayed in one line"

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# === Load dataset ===
df = pd.read_csv('./datasets/Wine_dataset.csv')

# === Select numeric columns ===
numeric_cols = df.select_dtypes(include=[np.number]).columns

# === Parameters for experiment ===
trials = 100
points = np.arange(10, 200, 10)

# === Layout settings ===
cols_per_row = 3
total = len(numeric_cols)
rows = (total + cols_per_row - 1) // cols_per_row  # number of rows needed

fig, axes = plt.subplots(rows, cols_per_row, figsize=(10, 3*rows))
axes = axes.flatten()

for idx, col in enumerate(numeric_cols):

    data = df[col].dropna().values
    true_mean = np.mean(data)

    means = np.zeros((trials, len(points)))

    for p_i, N in enumerate(points):
        for t in range(trials):
            sample = np.random.choice(data, N, replace=True)
            means[t, p_i] = np.mean(sample)

    est_mean = np.mean(means, axis=0)
    est_std = np.std(means, axis=0)

    sigma = np.std(data)
    upper = true_mean + sigma / np.sqrt(points)
    lower = true_mean - sigma / np.sqrt(points)

    # ====== change display style ======
    ax = axes[idx]
    ax.plot(points, upper, 'r', label='mean ± sigma/sqrt(N)')
    ax.plot(points, lower, 'r')

    ax.errorbar(points, est_mean, yerr=est_std,
                 fmt='k-o', capsize=5, label='estimated means')

    for i in range(len(points)):
        ax.plot(np.full(trials, points[i]), means[:, i],
                 'o', markersize=3, alpha=0.5)

    # Display title
    ax.set_title(f"{col}", fontsize=9)

# delete 
for i in range(total, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()
No description has been provided for this image

Correlation Analysis¶

I got the advice of seeing the relation of each factor from the one of the classmate Tsuchiya-san

In [4]:
from scipy.stats import spearmanr
import numpy as np
import seaborn as sns

cols = df.columns

cormatrix = pd.DataFrame(index=cols,columns=cols,dtype=float)
p_val_matrix = pd.DataFrame(index=cols,columns=cols,dtype=float)

for colx in cols:
    for coly in cols:
        if colx == coly:
            cormatrix.loc[colx,coly] = 1.0
            p_val_matrix.loc[colx,coly] = 1.0
        elif(pd.isna(cormatrix.loc[colx,coly])):
            corr_coef,p_value = spearmanr(df[colx],df[coly])

            cormatrix.loc[colx,coly] = corr_coef
            cormatrix.loc[coly,colx] = corr_coef
            p_val_matrix.loc[colx,coly] = p_value
            p_val_matrix.loc[coly,colx] = p_value

plt.figure(figsize=(10,8))
sns.heatmap(p_val_matrix,annot=True,annot_kws={"size": 6},cmap="binary_r",vmin=0.0,vmax=0.05)
plt.title("p-value of spearman correlation in all variables")
plt.tight_layout()
#plt.savefig('./notupload/sp-pval.png')
plt.show()

plt.figure(figsize=(10,8))
sns.heatmap(cormatrix,annot=True,annot_kws={"size": 6}, cmap="coolwarm",vmin=-1.0,vmax=1.0)
plt.title("co-efficience of spearman correlation in all variables")
plt.tight_layout()
#plt.savefig('./notupload/spearman.png')
plt.show()
No description has been provided for this image
No description has been provided for this image

Multidimensional distributions¶

Covariance matrix¶

As I see upper heatmaps, I found "Alcohol" and "Color intensity" seem to have strong influence each other.
So I tried to see covariance matrix of those 2 factors using Neil's covariance matrix code.

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('./datasets/Wine_dataset.csv')

varsamples = df[["Alcohol", "Color intensity"]].values
npts = varsamples.shape[0]

np.random.seed(10)
np.set_printoptions(precision=2)
np.set_printoptions(suppress=True)

#
# find mean and variance
#
varmean = np.mean(varsamples,axis=0)
varstd = np.sqrt(np.var(varsamples,axis=0))
varplotx = [varmean[0]-varstd[0],varmean[0]+varstd[0],None,varmean[0],varmean[0]]
varploty = [varmean[1],varmean[1],None,varmean[1]-varstd[1],varmean[1]+varstd[1]]
#
# generate covariance samples
#
#mean = [7,5]
#covariance = [[2.5,-2.1],[-2.1,2.5]]
#covarsamples = np.random.multivariate_normal(mean,covariance,npts)
#
# find mean, covariance, eigenvalues, and eigenvectors
#
covarmean = np.mean(covarsamples,axis=0)
covar = np.cov(covarsamples,rowvar=False)
evalu,evect = np.linalg.eig(covar)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[1,0]*np.sqrt(evalu[1])
dy0 = evect[0,1]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])
covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]
#
# plot and print
#
print("covariance matrix:")
print(covar)
samples = np.vstack((varsamples,covarsamples))
plt.figure()
plt.hist2d(samples[:,0],samples[:,1],bins=30,cmap='binary')
plt.plot(samples[:,0],samples[:,1],'o',markersize=1.5,alpha=0.3)
plt.plot(varmean[0],varmean[1],'ro')
plt.plot(covarmean[0],covarmean[1],'ro')
plt.plot(varplotx,varploty,'r')
plt.plot(covarplotx,covarploty,'r')
plt.text(2.5,-4,"variance",fontsize=15)
plt.text(-1.25,8.5,"covariance",fontsize=15)
plt.axis('off')
plt.show()
#
# print covariance matrices
#
print("covariance matrix:")
varcovar = np.cov(varsamples,rowvar=False)
print(varcovar)
covariance matrix:
[[ 2.41 -2.1 ]
 [-2.1   2.5 ]]
No description has been provided for this image
covariance matrix:
[[0.66 1.03]
 [1.03 5.37]]
In [ ]:
 
In [5]:
import numpy as np
import matplotlib.pyplot as plt

cols = df.columns

check_conb = []
for colx in cols:
    for coly in cols:
        if colx == coly:
            continue

        p = p_val_matrix.at[colx, coly]

        if p < 0.05:
            coeff = cormatrix.at[colx, coly]
            check_conb.append([colx, coly, p, coeff])

        # 重複削除(colx,coly と coly,colx は1つで良い)
        for r in check_conb:
            if r[0] == coly and r[1] == colx:
                check_conb.remove(r)


# ====== ここから subplot でまとめて描画 ======

n = len(check_conb)
cols_plot = 5                         # 1行に3つ
rows_plot = int(np.ceil(n / cols_plot))

plt.figure(figsize=(cols_plot * 8, rows_plot * 8))

for i, r in enumerate(check_conb, start=1):

    a = df[r[0]].to_numpy()
    b = df[r[1]].to_numpy()

    samples = np.column_stack((a, b))

    # ----- Variance -----
    varmean = np.mean(samples, axis=0)
    varstd = np.sqrt(np.var(samples, axis=0))
    varplotx = [varmean[0]-varstd[0], varmean[0]+varstd[0], None, varmean[0], varmean[0]]
    varploty = [varmean[1], varmean[1], None, varmean[1]-varstd[1], varmean[1]+varstd[1]]

    # ----- Covariance -----
    covarmean = np.mean(samples, axis=0)
    covar = np.cov(samples, rowvar=False)
    evalu, evect = np.linalg.eig(covar)

    dx0 = evect[0, 0] * np.sqrt(evalu[0])
    dx1 = evect[1, 0] * np.sqrt(evalu[1])
    dy0 = evect[0, 1] * np.sqrt(evalu[0])
    dy1 = evect[1, 1] * np.sqrt(evalu[1])

    covarplotx = [covarmean[0]-dx0, covarmean[0]+dx0, None,
                  covarmean[0]-dx1, covarmean[0]+dx1]
    covarploty = [covarmean[1]+dy0, covarmean[1]-dy0, None,
                  covarmean[1]+dy1, covarmean[1]-dy1]

    # ====== subplot 描画 ======
    plt.subplot(rows_plot, cols_plot, i)

    plt.hist2d(samples[:, 0], samples[:, 1], bins=30, cmap='binary')
    plt.plot(samples[:, 0], samples[:, 1], 'o', markersize=1.5, alpha=0.3)
    plt.plot(varmean[0], varmean[1], 'go')
    plt.plot(covarmean[0], covarmean[1], 'ro')
    plt.plot(varplotx, varploty, 'g', lw=3)
    plt.plot(covarplotx, covarploty, 'r', lw=3)

    title = f"{r[0]} vs {r[1]}\np={r[2]:.4f}, corr={r[3]:.4f}"
    plt.title(title, fontsize=14)
    plt.axis('off')

plt.tight_layout()
plt.savefig('./datas/covariance_all_in_one.png')
#plt.show()
No description has been provided for this image

Information¶

Entropy¶

I check this code with my dataset using 'Alcohol' factor. I asked chatGPT the part where I should change.

In [3]:
import numpy as np
import matplotlib.pyplot as plt
nbins = 256
#xmin = -4
#xmax = 4
# change x to winedataset
x = df['Alcohol'].to_numpy()

print(f"{nbins} bins = {np.log2(nbins):.0f} bits")
#
def entropy(dist):
    positives = dist[np.where(dist > 0)] # 0 log(0) = 0
    return -np.sum(positives*np.log2(positives))
#
hist, edges = np.histogram(x, nbins)
wine_dist = hist / np.sum(hist)

#fig,axs = plt.subplots(3,1)
#fig.canvas.header_visible = False
#width = 1.1*(xmax-xmin)/nbins
#axs[0].bar(x,uniform)
#axs[0].set_title(f"uniform entropy: {entropy(uniform):.1f} bits")
#axs[1].bar(x,normal,width=width)
#axs[1].set_title(f"Gaussian entropy: {entropy(normal):.1f} bits")
#axs[2].bar(x,onehot,width=width)
#axs[2].set_title(f"one-hot entropy: {entropy(onehot):.1f} bits")

# --- 描画 ---
fig, ax = plt.subplots(1,1, figsize=(6,4))
ax.bar(edges[:-1], wine_dist, width=edges[1]-edges[0])
ax.set_title(f"Wine Alcohol entropy: {entropy(wine_dist):.2f} bits")


plt.tight_layout()
256 bins = 8 bits
No description has been provided for this image
In [ ]:
I analysed the correlation between each item using entropy.
In [4]:
nbins = 256
# 1 dimension entropy
def entropy(dist):
    index = np.where(dist > 0) # o log(0) = 0
    positives = dist[index]
    return -np.sum(positives*np.log2(positives))

# 2 dimension entropy
def entropy2(dist):
    indexx,indexy = np.where(dist > 0) # 0 log(0) = 0
    positives = dist[indexx,indexy]
    return -np.sum(positives*np.log2(positives))

# information
def information(x,y):
    xhist,xedges = np.histogram(x,nbins)
    xdist = xhist/np.sum(xhist)
    yhist,yedges = np.histogram(y,nbins)
    ydist = yhist/np.sum(yhist)
    xyhist,xedges,yedges = np.histogram2d(x,y,[nbins,nbins])
    xydist = xyhist/np.sum(xyhist)
    Hx = entropy(xdist)
    Hy = entropy(ydist)
    Hxy = entropy2(xydist)

    return Hx+Hy-Hxy
In [5]:
position = 1
np.set_printoptions(precision=4)

results = []
#plt.figure(figsize=(30,120))
#
# add here from chatGPT
#
from itertools import combinations
from scipy.stats import pearsonr

check_conb = []

for c1, c2 in combinations(df.columns, 2):
    x = df[c1].to_numpy()
    y = df[c2].to_numpy()
    coeff, pval = pearsonr(x, y)
    check_conb.append((c1, c2, pval, coeff))

for r in check_conb:
    
    x = df[r[0]].to_numpy()
    y = df[r[1]].to_numpy()
    
    samples = np.column_stack((x,y))
    covar = np.cov(samples,rowvar=False)
    #print(f"{npts:.0e} points")
    #print(f"covariance:\n{covar}")

    #plt.subplot(18,3,position)
    I = information(x,y)
    #plt.scatter(x,y)
    #plt.title(f"{r[0]},{r[1]} mutual information: {I:.1f} bits, coeff: {r[3]:.3f}",fontsize=14)

    results.append([r[0],r[1],r[2],r[3],I,covar])


#plt.tight_layout()
#plt.savefig('./notupload/mi.png')
#plt.show()

pd.set_option('display.float_format', '{:.8f}'.format)
im_df = pd.DataFrame(results)
im_df.columns = ['val1','val2','p-val','coeff','IM','covar']
im_df = im_df.sort_values('IM',ascending=False)
im_df.to_csv('./imrank-sk.csv')

im_df.head(15)
position = 1
#plt.figure(figsize=(30,120))
nplots = len(im_df)          # ← 追加
rows = (nplots + 2) // 3     # ← 追加(3列固定)

plt.figure(figsize=(30, 6*rows))  # ← 高さも自動が望ましい

for i,row in im_df.iterrows():
    val1 = row['val1']
    val2 = row['val2']
    im = row['IM']
    coeff = row['coeff']
    covar = row['covar']
    
    x = df[val1].to_numpy()
    y = df[val2].to_numpy()
    #plt.subplot(18,3,position)
    plt.subplot(rows, 3, position)
    plt.scatter(x,y)
    plt.title(f"{val1},{val2},MI:{im:.1f} bits, coeff:{coeff:.3f}, covar:{covar}")

    position = position + 1

plt.tight_layout()
plt.savefig('./images/mi-sk.png')
plt.show() 
No description has been provided for this image

Mutal information¶

As I want to check this code with my dataset, 3 graphes are shown with this sample code.

In [7]:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(10)
np.set_printoptions(precision=1)
np.set_printoptions(suppress=True)
npts = int(1e5)
nbins = 256
print(f"{nbins} bins\n")
#
def entropy(dist):
    index = np.where(dist > 0) # 0 log(0) = 0
    positives = dist[index]
    return -np.sum(positives*np.log2(positives))
def entropy2(dist):
    indexx,indexy = np.where(dist > 0) # 0 log(0) = 0
    positives = dist[indexx,indexy]
    return -np.sum(positives*np.log2(positives))
def information(x,y):
    xhist,xedges = np.histogram(x,nbins)
    xdist = xhist/np.sum(xhist)
    yhist,yedges = np.histogram(y,nbins)
    ydist = yhist/np.sum(yhist)
    xyhist,xedges,yedges = np.histogram2d(x,y,[nbins,nbins])
    xydist = xyhist/np.sum(xyhist)
    Hx = entropy(xdist)
    Hy = entropy(ydist)
    Hxy = entropy2(xydist)
    return Hx+Hy-Hxy
#
# asked chatGPT to draw the graph using winedataset
#
# ==========================================
# 4種類の関係を Wine データから探して描く
# ==========================================

import matplotlib.pyplot as plt

# ---------- ① ランダム(相関もMIも小) ----------
rand_pair = im_df[
    (im_df['coeff'].abs() < 0.1) &
    (im_df['IM'] < im_df['IM'].quantile(0.2))
].iloc[0]

# ---------- ③ 非線形(相関は小、MIは大) ----------
nonlinear_pair = im_df[
    (im_df['coeff'].abs() < 0.2)
].sort_values('IM', ascending=False).iloc[0]

# ---------- ④ 直線(相関もMIも大) ----------
linear_pair = im_df[
    im_df['coeff'].abs() > 0.7
].sort_values('IM', ascending=False).iloc[0]

pairs = [
    ("Random", rand_pair),
    ("Nonlinear", nonlinear_pair),
    ("Linear", linear_pair),
]

# ---------- 描画 ----------
plt.figure(figsize=(15,4))

for i, (label, row) in enumerate(pairs, 1):
    x = df[row['val1']]
    y = df[row['val2']]

    plt.subplot(1, 3, i)
    plt.scatter(x, y)
    plt.xlabel(row['val1'])
    plt.ylabel(row['val2'])
    plt.title(
        f"{label}\n"
        f"coeff={row['coeff']:.2f}, MI={row['IM']:.2f}"
    )

plt.tight_layout()
plt.show()
256 bins

No description has been provided for this image

I found that 'Alcalinity of ash' and 'Magnesium'(left), and 'Flavonoids' and 'OD280/OD315 of diluted wines'(right) seem tohave correlation, but center one has a lot of random data points.

In [ ]: