# Class sample code

import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so every run draws the same samples.
np.random.seed(10)

# Number of samples to draw.
npts = 1000

# Parameters of the Gaussian (normal) distribution.
mean = 1    # mean of the Gaussian
stddev = 2  # standard deviation of the Gaussian

# Draw npts random samples from the Gaussian with the parameters above.
x = np.random.normal(loc=mean, scale=stddev, size=npts)

# Histogram of the samples.
# npts // 50 is floor division, so the bin count is an integer (20 for
# npts = 1000); the bins evenly split the range from the smallest to the
# largest sample.
# density=True normalizes the histogram so that its total area equals 1.
n_bins = npts // 50
plt.hist(x, bins=n_bins, density=True)
# Optional: mark every individual sample with a '|' tick along y = 0.
# plt.plot(x, 0*x, '|', ms=npts/20)

# Theoretical Gaussian curve, evaluated at 100 evenly spaced points
# between mean - 3*stddev and mean + 3*stddev.
curve_start = mean - 3 * stddev
curve_stop = mean + 3 * stddev
xi = np.linspace(curve_start, curve_stop, 100)

# Probability density function of the normal distribution:
#   yi = exp(-(x - mean)^2 / (2*sigma^2)) / sqrt(2*pi*sigma^2)
yi = np.exp(-(xi - mean) ** 2 / (2 * stddev ** 2)) / np.sqrt(2 * np.pi * stddev ** 2)

# Overlay the theoretical curve in red on top of the histogram.
plt.plot(xi, yi, 'r')

plt.show()

# Class sample code -> Histogram

import numpy as np
import matplotlib.pyplot as plt

# Fix the seed of NumPy's global random generator for reproducible results.
np.random.seed(10)

# How many samples to generate.
npts = 1000

# Mean and standard deviation of the Gaussian (normal) distribution.
mean = 1    # mean of the Gaussian
stddev = 2  # standard deviation of the Gaussian

# Generate npts random numbers that follow the Gaussian distribution.
x = np.random.normal(loc=mean, scale=stddev, size=npts)

# Plot a histogram of x.
# The number of bins is npts // 50 (floor division keeps it an integer:
# 20 for npts = 1000), dividing the min-to-max range of the data evenly.
# density=True rescales the bars so that the total area is 1.
bin_count = npts // 50
plt.hist(x, bins=bin_count, density=True)
# Optional: mark every individual sample with a '|' tick along y = 0.
# plt.plot(x, 0*x, '|', ms=npts/20)

plt.show()

# 1. Make a CSV file

import pandas as pd
import numpy as np

# Create the random data set.
from pathlib import Path

np.random.seed(10)  # fix the random seed so the file is reproducible

# Number of samples written to the CSV. The original comments said 100,
# but the code has always generated 1000; the count is now named so the
# comment and the code cannot drift apart again.
n_samples = 1000
data = np.random.randn(n_samples)  # n_samples draws from the standard normal

# Create DataFrame with a single "value" column.
df = pd.DataFrame({"value": data})

# Save as CSV. to_csv() does not create missing directories, so make sure
# the output folder exists before writing.
csv_path = Path("data/day5_random_data.csv")
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)

# 2. Read the CSV file
import pandas as pd
import matplotlib.pyplot as plt

# Read the CSV file of random values.
df = pd.read_csv("data/day5_random_data.csv")

# Plot histogram of the "value" column using 30 bins.
# density=True rescales the bars so the total area is 1, which means the
# y-axis shows a probability density rather than a count, so it is labelled
# "Density" (the same label the Gaussian-fit plot uses).
plt.hist(df["value"], bins=30, density=True, alpha=0.6)
plt.xlabel("Value")
plt.ylabel("Density")
plt.title("Histogram of Random Data")
plt.show()

#3. Create a Gaussian from Histogram

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import pandas as pd

# Load the random values from the CSV file and pick the "value" column.
df = pd.read_csv("data/day5_random_data.csv")
data = df["value"]

# ==== Gaussian fit ====
# norm.fit estimates the mean (mu) and standard deviation (sigma) of the data.
mu, sigma = norm.fit(data)

# ==== Histogram ====
# density=True normalizes the histogram so it can be compared with a PDF.
plt.hist(data, bins=30, density=True, alpha=0.6, label="Histogram")

# ==== Gaussian curve ====
# Evaluate the fitted PDF at 200 points spanning the current x-axis limits
# (i.e. the range the histogram was drawn over).
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 200)
p = norm.pdf(x, loc=mu, scale=sigma)

fit_label = f"Gaussian Fit\nμ={mu:.2f}, σ={sigma:.2f}"
plt.plot(x, p, 'r-', linewidth=2, label=fit_label)

# Axis labels, title and legend.
plt.xlabel("Value")
plt.ylabel("Density")
plt.title("Histogram + Gaussian Fit")
plt.legend()
plt.show()

import pandas as pd
import numpy as np

# Paths: the height frequency table to read and the raw-data file to write.
input_file = "data/day5_height_17_boy.csv"
output_file = "data/day5_height_17_boy_rawdata.csv"

# Load the frequency table (the code below uses its "height" and
# "frequency" columns).
df = pd.read_csv(input_file)

# Normalize the frequency column so that it sums to 1; np.random.choice
# requires its probabilities to add up to 1.
frequency_total = df["frequency"].sum()
df["frequency"] = df["frequency"] / frequency_total

# Number of individual samples to generate.
N = 1000

# Draw N heights at random, each height chosen with probability equal to
# its normalized frequency.
samples = np.random.choice(df["height"], size=N, p=df["frequency"])

# Wrap the samples in a one-column DataFrame and write it out as CSV.
df_raw = pd.DataFrame({"height": samples})
df_raw.to_csv(output_file, index=False)

# Report where the file was written and show its first rows.
print("Saved:", output_file)
print(df_raw.head())

Saved: data/day5_height_17_boy_rawdata.csv
   height
0     181
1     178
2     173
3     165
4     180

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import pandas as pd

# Read the generated raw height data and pick the "height" column.
df = pd.read_csv("data/day5_height_17_boy_rawdata.csv")
data = df["height"]

# ==== Gaussian Fit ====
# norm.fit estimates the mean (mu) and standard deviation (sigma) of the data.
mu, sigma = norm.fit(data)

# ==== Plot histogram ====
# The heights appear to be whole centimetres (see the sample printed when
# the file was generated). With a fixed bins=30 the bin width is generally
# not a whole number, so some bins cover two integer heights and others
# only one, producing artificial spikes and dips. Use 1-cm-wide bins
# centred on the integers instead, so each bar covers exactly one height.
bin_edges = np.arange(data.min() - 0.5, data.max() + 1.5, 1.0)
plt.hist(data, bins=bin_edges, density=True, alpha=0.6, label="Histogram")

# ==== Plot Gaussian curve ====
# Evaluate the fitted PDF at 200 points spanning the current x-axis limits.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 200)
p = norm.pdf(x, mu, sigma)

plt.plot(x, p, 'r-', linewidth=2, label=f"Gaussian Fit\nμ={mu:.2f}, σ={sigma:.2f}")

plt.xlabel("Value")
plt.ylabel("Density")
plt.title("Histogram + Gaussian Fit")
plt.legend()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.stats import norm

# 1. Load Data and Cleanup
# The frequency column is read as text so it can be cleaned up before it is
# converted to numbers.
freq_col = 'Frequency (‰)'
df = pd.read_csv('data/day5_height17.csv', dtype={freq_col: str})

# Clean-up steps for the frequency column:
#   1. remove every space character,
#   2. replace every '-' with '0' (presumably '-' marks an empty cell in the
#      source table -- TODO confirm),
#   3. convert to numbers; anything that still cannot be parsed becomes NaN.
freq_text = df[freq_col].astype(str)
freq_text = freq_text.str.replace(' ', '')
freq_text = freq_text.str.replace('-', '0')
df[freq_col] = pd.to_numeric(freq_text, errors='coerce')

# Drop the rows whose frequency could not be converted to a number.
df = df.dropna(subset=[freq_col])

# 2. Define Gaussian Distribution Function
def gaussian(x, mean, std_dev, scale):
    """Return a scaled normal (Gaussian) density evaluated at x.

    The value is ``scale`` times the normal probability density with
    location ``mean`` and standard deviation ``std_dev``, so the area
    under the returned curve is ``scale`` rather than 1.

    Args:
        x: Point(s) at which to evaluate the curve; passed to ``norm.pdf``.
        mean: Centre of the Gaussian.
        std_dev: Standard deviation of the Gaussian.
        scale: Multiplicative factor applied to the density.

    Returns:
        ``scale * norm.pdf(x, loc=mean, scale=std_dev)``.
    """
    density = norm.pdf(x, loc=mean, scale=std_dev)
    return scale * density

# 3. Parameter Initialization and Optimization (Curve Fitting)
# X: heights, Y: frequency of each height, both as NumPy arrays.
X = df['Height (cm)'].values
Y = df['Frequency (‰)'].values

# Initial estimates: frequency-weighted mean and standard deviation of the
# heights, and the total frequency as the starting scale factor.
mean_init = np.sum(X * Y) / np.sum(Y)
weighted_variance = np.sum(Y * (X - mean_init)**2) / np.sum(Y)
std_dev_init = np.sqrt(weighted_variance)
scale_init = np.sum(Y)

initial_params = [mean_init, std_dev_init, scale_init]

# Every parameter (mean, std dev, scale) is constrained to [0, inf).
lower_bounds = [0, 0, 0]
upper_bounds = [np.inf, np.inf, np.inf]

# Perform Optimization
try:
    params, covariance = curve_fit(
        gaussian, X, Y,
        p0=initial_params,
        bounds=(lower_bounds, upper_bounds),
    )
except RuntimeError:
    # curve_fit raises RuntimeError when the least-squares minimization
    # fails; fall back to the weighted estimates computed above.
    print("Optimization failed. Using initial estimates.")
    mean_fit, std_dev_fit = mean_init, std_dev_init
    scale_fit = scale_init
else:
    mean_fit, std_dev_fit, scale_fit = params
    
# 4. Plotting
plt.figure(figsize=(10, 6))

# --- Bar chart of the tabulated frequencies (one bar per height) ---
plt.bar(
    X, Y,
    width=0.9,
    color='skyblue',
    alpha=0.7,
    label='Height Distribution (Male, 17 Years Old)',
)

# --- Fitted Gaussian curve, evaluated at 500 points across the height range ---
x_fit = np.linspace(X.min(), X.max(), 500)
y_fit = gaussian(x_fit, mean_fit, std_dev_fit, scale_fit)
plt.plot(x_fit, y_fit, color='red', linewidth=2.5, label='Gaussian Fit')

# 5. Plot Aesthetics
plt.title('Height Distribution (Male, 17 Years Old) and Gaussian Fit', fontsize=16)
plt.xlabel('Height (cm)', fontsize=14)
plt.ylabel('Frequency (‰)', fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.6)

# Text box with the fitted parameters. The label parts are raw strings so
# that the backslashes in $\mu$ / $\sigma$ do not trigger a SyntaxWarning.
mean_line = r'Estimated Mean ($\mu$): ' + f'{mean_fit:.2f} cm'
std_dev_line = r'Estimated Std Dev ($\sigma$): ' + f'{std_dev_fit:.2f} cm'
param_text = mean_line + '\n' + std_dev_line

# Place the box in the upper-left corner using axes-relative coordinates.
axes = plt.gca()
box_style = {"boxstyle": "round,pad=0.5", "fc": "white", "alpha": 0.6}
plt.text(0.05, 0.95, param_text, transform=axes.transAxes,
         verticalalignment='top', bbox=box_style)

plt.legend()
plt.tight_layout()
plt.show()

# 6. Output Results
print("--- Fit Results ---")
print(f"Mean (μ): {mean_fit:.2f} cm")
print(f"Standard Deviation (σ): {std_dev_fit:.2f} cm")

--- Fit Results ---
Mean (μ): 165.46 cm
Standard Deviation (σ): 5.15 cm

5.Probability¶

Assignment (1/2)¶

Set up template notebooks and slides¶

Assignment (2/2)¶

Words¶

Probability (確率)¶

Gaussian (ガウス分布（正規分布）)¶

Histogram (度数分布図)¶

Understanding of Class sample code¶

What I do here is:　　¶

1. Make a CSV file¶

2. Read the CSV file¶

3. Create a Gaussian from Histogram¶

4. Use another CSV file¶

4-1. Probabilistic Generation of Raw Height Data from Height Frequency Distribution¶

4-2. Gaussian Fit Using Pseudo Individual-Level Data¶

4-3. Curve Fitting of a Gaussian Model to Aggregated Height Data¶

5.Probability¶

Assignment (1/2)¶

Set up template notebooks and slides¶

Assignment (2/2)¶

Words¶

Probability (確率)¶

Gaussian (ガウス分布（正規分布）)¶

Histogram (度数分布図)¶

Understanding of Class sample code¶

What I do here is: ¶

1. Make a CSV file¶

2. Read the CSV file¶

3. Create a Gaussian from Histogram¶

4. Use another CSV file¶

4-1. Probabilistic Generation of Raw Height Data from Height Frequency Distribution¶

4-2. Gaussian Fit Using Pseudo Individual-Level Data¶

4-3. Curve Fitting of a Gaussian Model to Aggregated Height Data¶

What I do here is:　　¶