Class¶
Assignment¶
- Investigate the probability distribution of your data
1. Histogram¶
- Histogram bins, density, and weight
- We can also normalize the bar heights into a probability density function using the density parameter: the sum over all bins of bin width times bar height equals 1 (checked in the sketch after this list).
- Rug Plot: a plot of data for a single quantitative variable, displayed as marks along an axis. It is used to visualise the distribution of the data.
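A minimal check of that normalization (my own sketch on synthetic data, not the class dataset): the areas of the density-normalized bars sum to 1.
import numpy as np
rng = np.random.default_rng(0)
data = rng.exponential(scale=100, size=1000)  # synthetic long-tailed sample
heights, edges = np.histogram(data, bins=20, density=True)
print(np.sum(heights * np.diff(edges)))  # -> 1.0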
Distribution of Total Git Commits (Fab Academy)¶
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#--- FA Commits data ---
df = pd.read_csv("datasets/fabacademy_commit_weekly_summary.csv")
commit_total = df['commit_total'].to_numpy() # total commits per student
npts = len(commit_total) # num of students
print(npts)
x = commit_total
#--- plot histogram
#plt.hist(x, bins=npts//50, density=True)
bins = np.arange(0, commit_total.max() + 50, 50)
plt.hist(x, bins, density=True) # density = True
plt.plot(x, 0*x, '|', ms=npts/20) # rug plot
plt.xlabel("total commits")
plt.ylabel("probability density")
plt.title("Distribution of total commits (Fab Academy)")
plt.show()
1509
Observations:
- The distribution shows a heavy long-tail shape: a small number of events occur with high probability, while a large number of events occur with low probability.
- The shape is asymmetric, unlike a Gaussian distribution.
- The rug plot indicates that there are data points on the right side as well, but they are hard to see in the histogram (the log-scale sketch below makes them visible).
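One quick way to reveal that sparse right tail (my own sketch, not part of the class code) is to log-scale the count axis:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv("datasets/fabacademy_commit_weekly_summary.csv")
x = df['commit_total'].to_numpy()
bins = np.arange(0, x.max() + 50, 50)
plt.hist(x, bins)
plt.yscale("log")  # log counts make the near-empty tail bins visible
plt.xlabel("total commits")
plt.ylabel("count per bin (log scale)")
plt.show()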
Count vs. Density¶
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#--- FA Commits data ---
df = pd.read_csv("datasets/fabacademy_commit_weekly_summary.csv")
commit_total = df['commit_total'].to_numpy()
npts = len(commit_total)
x = commit_total
bins = np.arange(0, commit_total.max() + 50, 50)
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
#--- histogram (count, density=False) ---
axs[0].hist(x, bins=bins, density=False, color="lightblue", edgecolor="black")
axs[0].set_xlabel("total commits")
axs[0].set_ylabel("count per bin")
axs[0].set_title("Histogram (Count)")
#--- histogram (probability density) ---
axs[1].hist(x, bins=bins, density=True, color="lightblue", edgecolor="black")
axs[1].plot(x, np.zeros_like(x), '|', ms=npts/20)  # rug plot
axs[1].set_xlabel("total commits")
axs[1].set_ylabel("probability density")
axs[1].set_title("Histogram (Density)")
plt.tight_layout()
plt.show()
Observations:
Count (left):
- Makes it easy to understand the scale or volume of the data.
- However, the values cannot be compared directly if the total amount of data changes.
Density (right):
- The vertical axis is normalized to a probability density (the total area equals 1).
- It does not depend on the number of data points (demonstrated in the sketch below).
- Suitable when you want to compare the shape of the distribution as a probability.
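A small demonstration of the sample-size point (synthetic data, my own sketch): bin counts grow with the number of points, while the density values stay comparable.
import numpy as np
rng = np.random.default_rng(0)
bins = np.arange(0, 8, 0.5)
for n in (100, 10000):
    data = rng.exponential(size=n)
    counts, _ = np.histogram(data, bins=bins)                 # grows with n
    density, _ = np.histogram(data, bins=bins, density=True)  # roughly stable
    print(n, counts[:3], np.round(density[:3], 2))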
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#--- FA Commits data ---
df = pd.read_csv("datasets/fabacademy_commit_weekly_summary.csv")
week_cols = [c for c in df.columns if "week_" in c]
weekly_mean = df[week_cols].mean(axis=0).to_numpy()
plt.figure(figsize=(12,5))
plt.plot(weekly_mean, marker='o')
plt.xticks(
ticks=np.arange(len(week_cols)),
labels=[c.replace("week_","W") for c in week_cols],
rotation=90
)
plt.ylabel("average commits per student")
plt.title("Weekly Average Commits (Fab Academy)")
plt.grid(True, alpha=0.3)
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#--- FA Commits data ---
df = pd.read_csv("datasets/fabacademy_commit_weekly_summary.csv")
week_cols = [c for c in df.columns if "week_" in c]
weeks = df[week_cols].to_numpy()
# --- variance and std for each week
weekly_var = np.var(weeks, axis=0)
weekly_std = np.std(weeks, axis=0)
labels = [c.replace("week_", "W") for c in week_cols]
plt.figure(figsize=(12,5))
plt.plot(weekly_var, marker='o', label="variance")
plt.xticks(ticks=np.arange(len(labels)), labels=labels, rotation=90)
plt.ylabel("variance")
plt.title("Weekly Variance of Commits (Fab Academy)")
plt.grid(alpha=0.3)
plt.show()
plt.figure(figsize=(12,5))
plt.plot(weekly_std, marker='o', label="std deviation", color='orange')
plt.xticks(ticks=np.arange(len(labels)), labels=labels, rotation=90)
plt.ylabel("standard deviation")
plt.title("Weekly Standard Deviation of Commits")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
fig, ax1 = plt.subplots()
ax1.plot(weeks.mean(axis=0), marker='o', label="mean", color='blue')
ax1.set_ylabel("mean")
ax1.set_xticks(np.arange(len(labels)))
ax1.set_xticklabels(labels, rotation=90)
ax2 = ax1.twinx()
ax2.plot(weekly_std, marker='o', label="std", color='red')
ax2.set_ylabel("std dev")
plt.title("Weekly Mean & Standard Deviation")
plt.show()
Multidimensional distributions @ Class¶
- variance
$\sigma_x^2 = \langle (x-\mu_x)^2 \rangle$
- covariance matrix
$C_{ij} = \langle (x_i-\mu_i)(x_j-\mu_j)\rangle$
- numpy.cov: Estimate a covariance matrix, given data and weights.
- numpy.linalg.eig: Compute the eigenvalues and right eigenvectors of a square array (a small usage sketch follows).
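A minimal numpy.linalg.eig sketch, using the same covariance matrix as the class example below. Note that the eigenvector paired with evalu[i] is the column evect[:, i], not the row.
import numpy as np
C = np.array([[2.5, -2.1],
              [-2.1, 2.5]])
evalu, evect = np.linalg.eig(C)
print(evalu)        # eigenvalues 2.5 +/- 2.1 -> [4.6 0.4] (order not guaranteed)
print(evect[:, 0])  # the unit eigenvector paired with evalu[0] is a column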
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(10)
np.set_printoptions(precision=2)
np.set_printoptions(suppress=True)
npts = 2000
#
# generate variance samples
#
mean = [1,1]
stddev = [0.75,2]
varsamples = np.random.normal(mean,stddev,(npts,2))
print(varsamples.shape)
#
# find mean and variance
#
varmean = np.mean(varsamples,axis=0)
varstd = np.sqrt(np.var(varsamples,axis=0))
print(varmean.shape, varmean)
print(varstd.shape, varstd)
varplotx = [varmean[0]-varstd[0],varmean[0]+varstd[0],None,varmean[0],varmean[0]]
varploty = [varmean[1],varmean[1],None,varmean[1]-varstd[1],varmean[1]+varstd[1]]
print(varplotx)
#
# generate covariance samples
#
mean = [7,5]
covariance = [[2.5,-2.1],[-2.1,2.5]]
covarsamples = np.random.multivariate_normal(mean,covariance,npts)
#
# find mean, covariance, eigenvalues, and eigenvectors
#
covarmean = np.mean(covarsamples,axis=0)
covar = np.cov(covarsamples,rowvar=False)
evalu,evect = np.linalg.eig(covar)
# 1-sigma principal half-axes: eigenvector components scaled by sqrt(eigenvalue)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[1,0]*np.sqrt(evalu[1])
dy0 = evect[0,1]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])
print("eigenvalues:", evalu)
print("eigenvectors:\n", evect)
print("shape:", evect.shape)
covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]
#
# plot and print
#
print("covariance matrix:")
print(covar)
samples = np.vstack((varsamples,covarsamples))
plt.figure()
plt.hist2d(samples[:,0],samples[:,1],bins=30,cmap='binary')
plt.plot(samples[:,0],samples[:,1],'o',markersize=1.5,alpha=0.3)
plt.plot(varmean[0],varmean[1],'ro')
plt.plot(covarmean[0],covarmean[1],'ro')
plt.plot(varplotx,varploty,'r')
plt.plot(covarplotx,covarploty,'r')
plt.text(2.5,-4,"variance",fontsize=15)
plt.text(-1.25,8.5,"covariance",fontsize=15)
plt.axis('off')
plt.show()
#
# print covariance matrices
#
print("covariance matrix:")
varcovar = np.cov(varsamples,rowvar=False)
print(varcovar)
(2000, 2)
(2,) [0.99 0.93]
(2,) [0.75 1.94]
[np.float64(0.2314822201644292), np.float64(1.7412817449912708), None, np.float64(0.9863819825778499), np.float64(0.9863819825778499)]
eigenvalues: [0.39 4.61]
eigenvectors:
 [[-0.71  0.7 ]
 [-0.7  -0.71]]
shape: (2, 2)
covariance matrix:
[[ 2.46 -2.11]
 [-2.11  2.53]]
covariance matrix:
[[ 0.57 -0.01]
 [-0.01  3.77]]
Multidimensional distributions of Total Git Commits and Active Weeks (Fab Academy)¶
Examine the covariance between the total number of Git commits and the number of active weeks (weeks with at least one commit).
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv("datasets/fabacademy_commit_weekly_summary.csv")
week_cols = [c for c in df.columns if "week_" in c]
total_commits = df["commit_total"].to_numpy()
active_weeks = (df[week_cols] > 0).sum(axis=1).to_numpy()  # weeks with at least one commit
samples = np.column_stack((total_commits, active_weeks))
print(samples.shape)
#
# find mean and variance
#
varmean = np.mean(samples,axis=0)
varstd = np.sqrt(np.var(samples,axis=0))
varplotx = [varmean[0]-varstd[0],varmean[0]+varstd[0],None,varmean[0],varmean[0]]
varploty = [varmean[1],varmean[1],None,varmean[1]-varstd[1],varmean[1]+varstd[1]]
covarmean = np.mean(samples,axis=0)
covar = np.cov(samples,rowvar=False)
evalu, evect = np.linalg.eig(covar)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[1,0]*np.sqrt(evalu[1])
dy0 = evect[0,1]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])
covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]
print("covariance matrix:")
print(covar)
plt.figure(figsize=(8,7))
plt.hist2d(samples[:,0], samples[:,1], bins=30, cmap='binary')
plt.plot(samples[:,0], samples[:,1], 'o', markersize=2, alpha=0.25)
# plot mean
plt.plot(varmean[0], varmean[1], 'ro')
# axis-aligned variance axes (1 sigma along x and y)
plt.plot(varplotx, varploty, 'g', label="variance (axis-aligned)")
# covariance principal axes (1 sigma along each eigenvector)
plt.plot(covarplotx, covarploty, 'r', label="covariance (principal axes)")
plt.xlabel("total commits")
plt.ylabel("active weeks")
plt.title("FabAcademy: variance & covariance")
plt.legend()
plt.show()
(1509, 2)
covariance matrix:
[[49296.75253607   629.70822501]
 [  629.70822501    53.7119388 ]]
plt.figure(figsize=(8,7))
ax = plt.gca()
ax.set_facecolor("black")
plt.gcf().patch.set_facecolor("black")
plt.hist2d(samples[:,0], samples[:,1],
bins=30,
cmap="magma",
alpha=0.6)
plt.plot(samples[:,0], samples[:,1],
'o', color="#7FFFD4",
markersize=3, alpha=0.35)
plt.plot(varmean[0], varmean[1],
'o', color="#FFD580", markersize=10, label="mean")
plt.plot(varplotx, varploty,
color="#BBBBBB", linewidth=2,
label="variance (axis-aligned)")
plt.plot(covarplotx, covarploty,
color="#A0C8FF", linewidth=2.2,
label="covariance (principal axes)")
plt.xlabel("total commits", color="white")
plt.ylabel("active weeks", color="white")
plt.title("FabAcademy: variance & covariance", color="white")
plt.tick_params(colors="white")
legend = plt.legend()
plt.setp(legend.get_texts(), color="white")
plt.tight_layout()
plt.show()
Observations:
covariance matrix:
[[49296.75253607 629.70822501]
[ 629.70822501 53.7119388 ]]
Diagonal (variance):
- The histogram confirmed that the total number of commits follows a long-tail distribution, and this is reflected in its very large variance of 49,296.
- In contrast, the variance of active weeks is only 53, indicating that the spread of active weeks is much smaller compared to the spread in total commits.
Off-diagonal terms (covariance):
- The covariance between total commits and active weeks is 629, suggesting a positive tendency: students who remain active for more weeks generally produce more commits (converted to a correlation coefficient below).
- This trend is also visible in the orientation of the red axis.
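To put that off-diagonal term on a dimensionless scale, it can be converted to a correlation coefficient (using the matrix printed above):
import numpy as np
covar = np.array([[49296.75, 629.71],
                  [629.71, 53.71]])
r = covar[0, 1] / np.sqrt(covar[0, 0] * covar[1, 1])
print(f"correlation = {r:.2f}")  # about 0.39: a moderate positive relationship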
Since I collected Trends data for my Final Presentation, I use it here as well.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df0 = pd.read_csv("datasets/Trends/k1_2019_2025_multiTimeline.csv", skiprows=2)
df1 = pd.read_csv("datasets/Trends/k2_2019_2025_multiTimeline.csv", skiprows=2)
label0 = df0.columns[1].split(":")[0].strip() # e.g. "Fab Academy"
label1 = df1.columns[1].split(":")[0].strip() # e.g. "FabLab"
x0 = df0.iloc[:, 1].to_numpy(dtype=float)
x1 = df1.iloc[:, 1].to_numpy(dtype=float)
samples = np.column_stack((x0, x1))
print(samples.shape)
# find mean and variance
#
varmean = np.mean(samples,axis=0)
varstd = np.sqrt(np.var(samples,axis=0))
varplotx = [varmean[0]-varstd[0],varmean[0]+varstd[0],None,varmean[0],varmean[0]]
varploty = [varmean[1],varmean[1],None,varmean[1]-varstd[1],varmean[1]+varstd[1]]
covarmean = np.mean(samples,axis=0)
covar = np.cov(samples,rowvar=False)
evalu, evect = np.linalg.eig(covar)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[1,0]*np.sqrt(evalu[1])
dy0 = evect[0,1]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])
covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]
print("covariance matrix:")
print(covar)
plt.figure(figsize=(8,7))
plt.hist2d(samples[:,0], samples[:,1], bins=30, cmap='binary')
plt.plot(samples[:,0], samples[:,1], 'o', markersize=2, alpha=0.25)
# plot mean
plt.plot(varmean[0], varmean[1], 'ro')
# axis-aligned variance axes (1 sigma along x and y)
plt.plot(varplotx, varploty, 'g', label="variance (axis-aligned)")
# covariance principal axes (1 sigma along each eigenvector)
plt.plot(covarplotx, covarploty, 'r', label="covariance (principal axes)")
plt.xlabel(label0)
plt.ylabel(label1)
plt.title("FabAcademy: variance & covariance")
plt.legend()
plt.show()
(84, 2)
covariance matrix:
[[233.40605278  88.32085485]
 [ 88.32085485 212.34581182]]
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df0 = pd.read_csv("datasets/Trends/k1_2019_2025_multiTimeline.csv", skiprows=2)
df1 = pd.read_csv("datasets/Trends/k2_2019_2025_multiTimeline.csv", skiprows=2)
CSV_FILES = [
"datasets/Trends/k1_2019_2025_multiTimeline.csv", # df0 = 基準
"datasets/Trends/k2_2019_2025_multiTimeline.csv",
"datasets/Trends/k3_2019_2025_multiTimeline.csv",
"datasets/Trends/k4_2019_2025_multiTimeline.csv",
"datasets/Trends/k5_2019_2025_multiTimeline.csv",
"datasets/Trends/k6_2019_2025_multiTimeline.csv",
"datasets/Trends/k7_2019_2025_multiTimeline.csv"
]
def load_trends_csv(path):
df = pd.read_csv(path, skiprows=2)
label = df.columns[1].split(":")[0].strip()
values = df.iloc[:,1].astype(float).to_numpy()
months = df.iloc[:,0].to_numpy()
return label, months, values
def plot_covariance(x, y, label0, label1, ax):
samples = np.column_stack((x, y))
varmean = np.mean(samples,axis=0)
varstd = np.sqrt(np.var(samples,axis=0))
varplotx = [varmean[0]-varstd[0],varmean[0]+varstd[0],None,varmean[0],varmean[0]]
varploty = [varmean[1],varmean[1],None,varmean[1]-varstd[1],varmean[1]+varstd[1]]
covarmean = np.mean(samples,axis=0)
covar = np.cov(samples,rowvar=False)
evalu, evect = np.linalg.eig(covar)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[1,0]*np.sqrt(evalu[1])
dy0 = evect[0,1]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])
covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]
ax.hist2d(samples[:,0], samples[:,1], bins=30, cmap='binary')
ax.plot(samples[:,0], samples[:,1], 'o', markersize=2, alpha=0.25)
# plot mean
ax.plot(varmean[0], varmean[1], 'ro')
    # axis-aligned variance axes (1 sigma along x and y)
    ax.plot(varplotx, varploty, 'g', label="variance (axis-aligned)")
    # covariance principal axes (1 sigma along each eigenvector)
    ax.plot(covarplotx, covarploty, 'r', label="covariance (principal axes)")
ax.set_xlabel(label0)
ax.set_ylabel(label1)
return covar
labels = []
months_list = []
values_list = []
for file in CSV_FILES:
label, months, values = load_trends_csv(file)
labels.append(label)
months_list.append(months)
values_list.append(values)
fig, axes = plt.subplots(2,3, figsize=(18,10))
axes = axes.flatten()
for i in range(1, len(CSV_FILES)):
ax = axes[i-1]
cov = plot_covariance(values_list[0], values_list[i], labels[0], labels[i], ax)
cov_text = f"covariance:\n[{cov[0,0]:.2f}, {cov[0,1]:.2f}]\n[{cov[1,0]:.2f}, {cov[1,1]:.2f}]"
ax.text(0.5, -0.15, cov_text,
transform=ax.transAxes,
ha='center', va='top',
fontsize=9)
plt.tight_layout()
plt.show()
Entropy of Total Git Commits (Fab Academy)¶
$H = -\sum_i p(x_i)\log_2 p(x_i)$
- measures the information in a sample
- When calculated using log₂, the units are bits (see the check below).
- Do not use density=True in the histogram here: it converts the bin values into a probability density rather than probabilities.
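A quick sanity check on the units (my own sketch): a uniform distribution over 8 outcomes carries log₂ 8 = 3 bits.
import numpy as np
p = np.full(8, 1/8)  # uniform distribution over 8 outcomes
print(-np.sum(p * np.log2(p)))  # -> 3.0 bits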
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#--- FA Commits data ---
df = pd.read_csv("datasets/fabacademy_commit_weekly_summary.csv")
commit_total = df['commit_total'].to_numpy() # total commits per student
npts = len(commit_total) # num of students
x = commit_total
#-- entropy --
bins = np.arange(0, commit_total.max() + 50, 50)  # define bins before using them
hist = np.histogram(x, bins=bins, density=False)
data = hist[0]
dist = data/data.sum()
def entropy(dist):
positives = dist[np.where(dist > 0)]
return -np.sum(positives * np.log2(positives))
H = entropy(dist)
print(f"entropy = {H:.2f} bits")
#--- plot histogram (same bins, raw counts)
plt.hist(x, bins, density=False)  # density=False: counts
plt.xlabel("total commits")
plt.ylabel("counts")
plt.title(f"Fab Academy total commits entropy: {H:.1f} bits")
plt.show()
entropy = 3.34 bits
Mutual Information of Total Git Commits and Active Weeks (Fab Academy)¶
- $H(x)+H(y)-H(x,y) = -\sum_i p(x_i)\log_2 p(x_i) - \sum_j p(y_j)\log_2 p(y_j) + \sum_i \sum_j p(x_i,y_j)\log_2 p(x_i,y_j)$
- equal to the difference between the information in the samples taken separately and taken together (a small sanity check follows this list)
- measures nonlinear as well as linear relationships
- without priors, the amount of data needed for estimation grows exponentially with dimension
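A small sanity check of the definition (synthetic data, my own sketch): for independent samples the histogram estimate is near zero, while for y = x it equals H(x).
import numpy as np
rng = np.random.default_rng(0)
def mi(x, y, nbins=16):
    # histogram estimate of I = H(x) + H(y) - H(x,y)
    pxy, _, _ = np.histogram2d(x, y, bins=nbins)
    pxy = pxy / pxy.sum()
    def H(p):
        p = p[p > 0]  # 0 log(0) = 0
        return -np.sum(p * np.log2(p))
    return H(pxy.sum(axis=1)) + H(pxy.sum(axis=0)) - H(pxy)
x = rng.normal(size=5000)
print(mi(x, rng.normal(size=5000)))  # independent -> near 0 (small estimator bias)
print(mi(x, x))                      # y = x -> equals H(x) of the binned sample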
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# --- load data ---
df = pd.read_csv("datasets/fabacademy_commit_weekly_summary.csv")
week_cols = [c for c in df.columns if "week_" in c]
x = df["commit_total"].to_numpy()
y = (df[week_cols] > 0).sum(axis=1).to_numpy()
nbins = 256
def entropy(dist):
index = np.where(dist > 0) # 0 log(0) = 0
positives = dist[index]
return -np.sum(positives*np.log2(positives))
def entropy2(dist):
indexx,indexy = np.where(dist > 0) # 0 log(0) = 0
positives = dist[indexx,indexy]
return -np.sum(positives*np.log2(positives))
def information(x,y):
xhist,xedges = np.histogram(x,nbins)
xdist = xhist/np.sum(xhist)
yhist,yedges = np.histogram(y,nbins)
ydist = yhist/np.sum(yhist)
xyhist,xedges,yedges = np.histogram2d(x,y,[nbins,nbins])
xydist = xyhist/np.sum(xyhist)
Hx = entropy(xdist)
Hy = entropy(ydist)
Hxy = entropy2(xydist)
return Hx+Hy-Hxy
# -- covariance --
npts = len(x)  # number of students
covar = np.cov(np.c_[x,y],rowvar=False)
print(f"{npts:.0e} points")
print(f"linear covariance:\n{covar}")
# --- mutual information ---
I = information(x, y)
print(f"Mutual Information : {I:.3f} bits")
# --- optional scatter plot ---
plt.figure(figsize=(6,5))
plt.plot(x, y, 'o')
plt.xlabel("total commits")
plt.ylabel("active weeks")
plt.title(f"Scatter plot\nMI = {I:.1f} bits")
plt.show()
2e+03 points
linear covariance:
[[49296.75253607   629.70822501]
 [  629.70822501    53.7119388 ]]
Mutual Information : 1.326 bits
MI and Trends Data¶
For the final presentation, I collected several Trends datasets and visualized their mutual information.
The Trends data were obtained from Google Trends, one keyword at a time, with the following settings:
a) Period: 2019–2025, Region: Worldwide, Keyword: Fab Academy
b) Period: 2019–2025, Region: Worldwide, Keyword: Fablab
c) Period: 2019–2025, Region: Worldwide, Keyword: 3D Printer
d) Period: 2019–2025, Region: Worldwide, Keyword: Data Science
e) Period: 2019–2025, Region: Worldwide, Keyword: SDGs
f) Period: 2019–2025, Region: Worldwide, Keyword: Maker Faire
Using Fab Academy (a) as the common reference series, I computed the Mutual Information for the following pairs:
Fab Academy × Fablab
Fab Academy × 3D Printer
Fab Academy × Data Science
Fab Academy × SDGs
Fab Academy × Maker Faire
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
CSV_FILES = [
"datasets/Trends/k1_2019_2025_multiTimeline.csv",
"datasets/Trends/k2_2019_2025_multiTimeline.csv",
"datasets/Trends/k3_2019_2025_multiTimeline.csv",
"datasets/Trends/k4_2019_2025_multiTimeline.csv",
"datasets/Trends/k5_2019_2025_multiTimeline.csv",
"datasets/Trends/k6_2019_2025_multiTimeline.csv",
"datasets/Trends/k7_2019_2025_multiTimeline.csv"
]
OUTPUT_JS = "datasets/Trends/trendData.js"
def load_trends_csv(path):
df = pd.read_csv(path, skiprows=2)
label = df.columns[1].split(":")[0].strip()
values = df.iloc[:, 1].astype(float).to_numpy()
months = df.iloc[:, 0].to_numpy()
return label, months, values
def entropy_1d(dist):
idx = np.where(dist > 0)
p = dist[idx]
return -np.sum(p * np.log2(p))
def entropy_2d(dist):
    # same as entropy_1d: np.where indexing also flattens a 2D histogram
    idx = np.where(dist > 0)
    p = dist[idx]
    return -np.sum(p * np.log2(p))
def mutual_information(x, y, nbins=256):
xhist, _ = np.histogram(x, bins=nbins)
yhist, _ = np.histogram(y, bins=nbins)
xyhist, _, _ = np.histogram2d(x, y, bins=[nbins, nbins])
xdist = xhist / np.sum(xhist)
ydist = yhist / np.sum(yhist)
xydist = xyhist / np.sum(xyhist)
Hx = entropy_1d(xdist)
Hy = entropy_1d(ydist)
Hxy = entropy_2d(xydist)
return Hx + Hy - Hxy
def yearly_average(months, values):
df = pd.DataFrame({"month": months, "value": values})
df["year"] = df["month"].str.slice(0, 4).astype(int)
return df.groupby("year")["value"].mean().round(2).to_dict()
def plot_mi_scatter(x, y, label0, label1, ax):
ax.scatter(x, y, s=8, alpha=0.25)
MI = mutual_information(x, y)
ax.set_title(f"{label0} vs {label1}\nMI = {MI:.3f} bits")
ax.set_xlabel(label0)
ax.set_ylabel(label1)
labels = []
months_all = []
values_all = []
for file in CSV_FILES:
label, months, values = load_trends_csv(file)
labels.append(label)
months_all.append(months)
values_all.append(values)
base_label = labels[0]
base_values = values_all[0]
MI_values = {}
fig, axes = plt.subplots(1, len(CSV_FILES)-1, figsize=(5*(len(CSV_FILES)-1), 5))
for i in range(1, len(CSV_FILES)):
x = base_values
y = values_all[i]
MI = mutual_information(x, y)
MI_values[labels[i]] = round(float(MI), 5)
    plot_mi_scatter(x, y, base_label, labels[i], axes[i-1])
plt.tight_layout()
plt.show()
trend_js = {}
for label, months, values in zip(labels, months_all, values_all):
clean_label = label.replace(" ", "_")
trend_js[clean_label] = {}
trend_js[clean_label]["years"] = yearly_average(months, values)
if label in MI_values:
trend_js[clean_label]["MI_with_" + base_label.replace(" ", "_")] = MI_values[label]
with open(OUTPUT_JS, "w", encoding="utf-8") as f:
f.write("export const trendData = ")
json.dump(trend_js, f, indent=2)
f.write(";")
print("Saved:", OUTPUT_JS)
Saved: datasets/Trends/trendData.js
i = 3
x = base_values
y = values_all[i]
label_y = labels[i]
MI = mutual_information(x, y)
MI_values[label_y] = round(float(MI), 5)
fig, ax = plt.subplots(figsize=(6, 5))
fig.patch.set_facecolor("black")
ax.set_facecolor("black")
ax.scatter(x, y, color="#7FFFD4", s=12, alpha=0.85)
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
#ax.plot(x, p(x), color="#DDDDDD", linewidth=2)
ax.set_xlabel(base_label, color="white")
ax.set_ylabel(label_y, color="white")
ax.set_title(f"{label_y} vs {base_label}\nMI = {MI_values[label_y]}", color="white")
ax.tick_params(colors="white")
for spine in ax.spines.values():
spine.set_color("white")
plt.tight_layout()
plt.show()