Data Science - Week 3: Probability Assignment¶
Student: Sedat Yalcin
Dataset: Istanbul Traffic Index (2015-2024) - traffic_index.csv
1. Load and Explore Data¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm
import warnings
warnings.filterwarnings('ignore')
# Load traffic data
df = pd.read_csv('datasets/traffic_index.csv')
df['trafficindexdate'] = pd.to_datetime(df['trafficindexdate'])
df = df.sort_values('trafficindexdate').reset_index(drop=True)
print(f"Dataset: {len(df)} observations")
print(f"Date range: {df['trafficindexdate'].min().date()} to {df['trafficindexdate'].max().date()}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())
print(f"\nData types:")
print(df.dtypes)
print(f"\nBasic statistics:")
print(df[['minimum_traffic_index', 'maximum_traffic_index', 'average_traffic_index']].describe())
Dataset: 3332 observations
Date range: 2015-08-06 to 2024-10-18
Columns: ['trafficindexdate', 'minimum_traffic_index', 'maximum_traffic_index', 'average_traffic_index']
First few rows:
trafficindexdate minimum_traffic_index maximum_traffic_index \
0 2015-08-06 00:00:00+00:00 24 63
1 2015-08-07 00:00:00+00:00 2 49
2 2015-08-11 00:00:00+00:00 11 62
3 2015-08-12 00:00:00+00:00 1 63
4 2015-08-13 00:00:00+00:00 2 56
average_traffic_index
0 57.858116
1 23.770492
2 38.601266
3 29.715278
4 28.557491
Data types:
trafficindexdate datetime64[ns, UTC]
minimum_traffic_index int64
maximum_traffic_index int64
average_traffic_index float64
dtype: object
Basic statistics:
minimum_traffic_index maximum_traffic_index average_traffic_index
count 3332.000000 3332.000000 3332.000000
mean 2.075630 59.956483 27.830846
std 2.751651 15.613316 8.252579
min 1.000000 4.000000 1.083916
25% 1.000000 53.000000 23.651568
50% 1.000000 63.000000 28.979524
75% 2.000000 71.000000 33.491313
max 58.000000 90.000000 59.428571
2. Histogram and Density Estimation¶
A histogram counts how many data points fall into each bin; with density=True the counts are divided by N times the bin width so the bars integrate to 1 (see the sketch below). We'll visualize the distribution of average traffic index values.
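To make the density normalization concrete, here is a minimal sketch on synthetic data (the real column is extracted in the next cell) showing that density=True simply divides the bin counts by N times the bin width:
# Minimal sketch: a density histogram is counts / (N * bin_width)
rng = np.random.default_rng(0)
demo = rng.normal(loc=28, scale=8, size=1000)  # synthetic stand-in for the traffic index
counts_demo, edges_demo = np.histogram(demo, bins=20)  # raw counts per bin
bin_width = edges_demo[1] - edges_demo[0]  # bins are uniform
manual_density = counts_demo / (len(demo) * bin_width)  # normalize counts to a density
auto_density, _ = np.histogram(demo, bins=20, density=True)  # NumPy's built-in normalization
assert np.allclose(manual_density, auto_density)
print(f"Total area under histogram: {np.sum(auto_density * bin_width):.4f}")  # ≈ 1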
# Extract average traffic index
x = df['average_traffic_index'].values
npts = len(x)
# Create histogram
plt.figure(figsize=(12, 5))
# Plot 1: Histogram with sample of points (too many points to show all)
plt.subplot(1, 2, 1)
n_bins = npts // 50 # roughly one bin per 50 observations
counts, bins, patches = plt.hist(x, bins=n_bins, density=True, alpha=0.7, color='skyblue', edgecolor='black')
# Show only a sample of data points to avoid overcrowding
sample_indices = np.random.choice(len(x), size=min(200, len(x)), replace=False)
plt.plot(x[sample_indices], np.zeros_like(x[sample_indices]), '|', ms=8, color='red', alpha=0.5, label='sample data points')
plt.xlabel('Average Traffic Index')
plt.ylabel('Density')
plt.title('Histogram of Average Traffic Index')
plt.legend()
plt.grid(True, alpha=0.3)
# Plot 2: Histogram with different bin sizes
plt.subplot(1, 2, 2)
for bins_count in [20, 50, 100]:
    plt.hist(x, bins=bins_count, density=True, alpha=0.5, label=f'{bins_count} bins')
plt.xlabel('Average Traffic Index')
plt.ylabel('Density')
plt.title('Histogram with Different Bin Sizes')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Number of data points: {npts}")
print(f"Bin size used: {n_bins}")
print(f"Note: Showing sample of {min(200, len(x))} data points to avoid overcrowding")
Number of data points: 3332
Number of bins used: 66
Note: Showing sample of 200 data points to avoid overcrowding
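Choosing npts // 50 bins is one heuristic; NumPy also ships standard bin-selection rules, which we can compare as a quick sketch (assuming x from the cell above):
# Compare common bin-count rules on the same data
for rule in ['sturges', 'scott', 'fd']:  # Sturges, Scott, Freedman-Diaconis
    edges = np.histogram_bin_edges(x, bins=rule)
    print(f"{rule:>8}: {len(edges) - 1} bins")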
3. Statistical Measures¶
Calculate mean, variance, standard deviation, and other statistical properties.
# Calculate statistics
mean_x = np.mean(x)
variance_x = np.var(x, ddof=0) # Population variance
variance_x_sample = np.var(x, ddof=1) # Sample variance
stddev_x = np.std(x, ddof=0) # Population standard deviation
stddev_x_sample = np.std(x, ddof=1) # Sample standard deviation
# Using expectation notation: <f(x)> = (1/N) * sum(f(x_i))
expectation_x = np.mean(x) # <x>
expectation_x_squared = np.mean(x**2) # <x²>
# Note: variance = <x²> - <x>²
variance_from_expectation = expectation_x_squared - expectation_x**2
print("=" * 60)
print("STATISTICAL MEASURES")
print("=" * 60)
print(f"Mean (μ): {mean_x:.4f}")
print(f"Variance (σ², population): {variance_x:.4f}")
print(f"Variance (σ², sample): {variance_x_sample:.4f}")
print(f"Standard Deviation (σ, pop): {stddev_x:.4f}")
print(f"Standard Deviation (σ, samp): {stddev_x_sample:.4f}")
print(f"\nExpectation <x>: {expectation_x:.4f}")
print(f"Expectation <x²>: {expectation_x_squared:.4f}")
print(f"Variance from <x²> - <x>²: {variance_from_expectation:.4f} (should match above)")
print(f"\nMin value: {np.min(x):.4f}")
print(f"Max value: {np.max(x):.4f}")
print(f"Range: {np.max(x) - np.min(x):.4f}")
print(f"Median: {np.median(x):.4f}")
print(f"25th percentile: {np.percentile(x, 25):.4f}")
print(f"75th percentile: {np.percentile(x, 75):.4f}")
print("=" * 60)
============================================================
STATISTICAL MEASURES
============================================================
Mean (μ): 27.8308
Variance (σ², population): 68.0846
Variance (σ², sample): 68.1051
Standard Deviation (σ, pop): 8.2513
Standard Deviation (σ, samp): 8.2526

Expectation <x>: 27.8308
Expectation <x²>: 842.6406
Variance from <x²> - <x>²: 68.0846 (should match above)

Min value: 1.0839
Max value: 59.4286
Range: 58.3447
Median: 28.9795
25th percentile: 23.6516
75th percentile: 33.4913
============================================================
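The small gap between the population and sample variance above is Bessel's correction: the sample estimate rescales the population estimate by N/(N-1). A quick check (assuming x from the cell above):
# Bessel's correction: sample variance = population variance * N / (N - 1)
n = len(x)
assert np.isclose(np.var(x, ddof=1), np.var(x, ddof=0) * n / (n - 1))
print(f"Correction factor N/(N-1): {n / (n - 1):.6f}")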
4. Gaussian (Normal) Distribution Fit¶
Check whether the data follow a Gaussian distribution by fitting a normal curve and comparing it with the histogram; a Q-Q plot provides a visual check.
# Fit Gaussian distribution
mu_fit, sigma_fit = norm.fit(x)
# Generate Gaussian curve
x_range = np.linspace(np.min(x), np.max(x), 100)
gaussian_curve = norm.pdf(x_range, mu_fit, sigma_fit)
# Ensure required variables exist (in case cells are run out of order)
if 'mean_x' not in globals() or 'stddev_x' not in globals():
    mean_x = np.mean(x)
    stddev_x = np.std(x, ddof=0)
if 'npts' not in globals():
    npts = len(x)
# Plot histogram with Gaussian fit
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.hist(x, bins=npts//50, density=True, alpha=0.7, color='skyblue', edgecolor='black', label='Histogram')
plt.plot(x_range, gaussian_curve, 'r-', linewidth=2, label=f'Gaussian fit (μ={mu_fit:.2f}, σ={sigma_fit:.2f})')
plt.xlabel('Average Traffic Index')
plt.ylabel('Density')
plt.title('Histogram with Gaussian Fit')
plt.legend()
plt.grid(True, alpha=0.3)
# Q-Q plot to check normality
# Q-Q plot compares quantiles of data with quantiles of normal distribution
# Points on the line indicate normal distribution
plt.subplot(1, 2, 2)
stats.probplot(x, dist="norm", plot=plt)
plt.title('Q-Q Plot (Normal Distribution)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Fitted Gaussian parameters:")
print(f" Mean (μ): {mu_fit:.4f}")
print(f" Std (σ): {sigma_fit:.4f}")
print(f"\nComparison with data:")
print(f" Data mean: {mean_x:.4f} (difference: {abs(mean_x - mu_fit):.4f})")
print(f" Data std: {stddev_x:.4f} (difference: {abs(stddev_x - sigma_fit):.4f})")
print(f"\nNote: Q-Q plot shows how well data matches normal distribution.")
print(f" Points close to the red line indicate normality.")
Fitted Gaussian parameters:
Mean (μ): 27.8308
Std (σ): 8.2513
Comparison with data:
Data mean: 27.8308 (difference: 0.0000)
Data std: 8.2513 (difference: 0.0000)
Note: Q-Q plot shows how well data matches normal distribution.
Points close to the red line indicate normality.
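The differences above are exactly zero by construction: norm.fit uses maximum likelihood, and for a Gaussian the MLE is the sample mean together with the ddof=0 standard deviation, i.e. the same quantities we computed by hand. A quick check (assuming x, mu_fit, sigma_fit from the cell above):
# For a Gaussian, the maximum-likelihood estimates equal the sample mean
# and the population (ddof=0) standard deviation
assert np.isclose(mu_fit, np.mean(x))
assert np.isclose(sigma_fit, np.std(x, ddof=0))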
5. Normality Test¶
Use statistical tests to check if the data is normally distributed.
# Perform normality tests
# Shapiro-Wilk is most reliable for n < 5000; for larger samples we rely on
# the Kolmogorov-Smirnov and D'Agostino tests below
if len(x) < 5000:
    shapiro_stat, shapiro_p = stats.shapiro(x)
    shapiro_valid = True
else:
    shapiro_stat, shapiro_p = None, None
    shapiro_valid = False
# Perform Kolmogorov-Smirnov test against the fitted Gaussian
# (μ and σ were estimated from the data, so the p-value is only approximate;
# a Lilliefors correction would be needed for an exact test)
ks_stat, ks_p = stats.kstest(x, 'norm', args=(mu_fit, sigma_fit))
# Perform D'Agostino's normality test (combines skewness and kurtosis)
dagostino_stat, dagostino_p = stats.normaltest(x)
print("=" * 60)
print("NORMALITY TESTS")
print("=" * 60)
if shapiro_valid:
    print(f"Shapiro-Wilk test (best for n < 5000):")
    print(f"  Statistic: {shapiro_stat:.4f}")
    print(f"  p-value: {shapiro_p:.4f}")
    print(f"  Result: {'Normal' if shapiro_p > 0.05 else 'Not normal'} (α=0.05)")
else:
    print(f"Shapiro-Wilk test: Not recommended for n ≥ 5000 (n={len(x)})")
print(f"\nKolmogorov-Smirnov test:")
print(f" Statistic: {ks_stat:.4f}")
print(f" p-value: {ks_p:.4f}")
print(f" Result: {'Normal' if ks_p > 0.05 else 'Not normal'} (α=0.05)")
print(f"\nD'Agostino's test (skewness + kurtosis):")
print(f" Statistic: {dagostino_stat:.4f}")
print(f" p-value: {dagostino_p:.4f}")
print(f" Result: {'Normal' if dagostino_p > 0.05 else 'Not normal'} (α=0.05)")
print("=" * 60)
============================================================
NORMALITY TESTS
============================================================
Shapiro-Wilk test (best for n < 5000):
  Statistic: 0.9656
  p-value: 0.0000
  Result: Not normal (α=0.05)

Kolmogorov-Smirnov test:
  Statistic: 0.0685
  p-value: 0.0000
  Result: Not normal (α=0.05)

D'Agostino's test (skewness + kurtosis):
  Statistic: 242.9078
  p-value: 0.0000
  Result: Not normal (α=0.05)
============================================================
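With n in the thousands, these tests detect even tiny departures from normality, so p ≈ 0 is unsurprising. As one more check that reports critical values rather than a p-value, here is a sketch using the Anderson-Darling test (assuming x from the cell above):
# Anderson-Darling test: compare the statistic against tabulated critical values
ad_result = stats.anderson(x, dist='norm')
print(f"Anderson-Darling statistic: {ad_result.statistic:.4f}")
for crit, sig in zip(ad_result.critical_values, ad_result.significance_level):
    verdict = 'reject normality' if ad_result.statistic > crit else 'cannot reject'
    print(f"  At {sig:.0f}% significance: critical value {crit:.3f} → {verdict}")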
6. Variance and Covariance¶
Variance¶
- Measures the spread of a single variable: $\sigma_x^2 = \langle (x-\mu_x)^2 \rangle$
- Shows how much values deviate from the mean
Covariance¶
- Measures joint dependence between two variables: $\mathrm{Cov}(x,y) = \langle (x-\mu_x)(y-\mu_y) \rangle$
- Positive: variables increase together
- Negative: one increases as the other decreases
- Zero: no linear relationship
Correlation Coefficient¶
- Normalized covariance: $\rho(x,y) = \cfrac{\mathrm{Cov}(x,y)}{\sigma_x \sigma_y}$
- Range: -1 to +1
- Measures strength and direction of linear relationship (a small worked example follows below)
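Before applying these formulas to the traffic columns, here is a small worked example on a toy dataset (all values hypothetical):
# Worked example: covariance and correlation on a tiny toy dataset
toy_x = np.array([1.0, 2.0, 3.0, 4.0])
toy_y = np.array([2.0, 4.0, 5.0, 9.0])  # roughly increases with toy_x
mu_tx, mu_ty = toy_x.mean(), toy_y.mean()
cov_xy = np.mean((toy_x - mu_tx) * (toy_y - mu_ty))  # Cov(x,y) = <(x-μx)(y-μy)>
rho_xy = cov_xy / (toy_x.std() * toy_y.std())  # ρ = Cov / (σx σy)
print(f"Cov(x,y) = {cov_xy:.4f}")  # positive: the variables increase together
print(f"ρ(x,y) = {rho_xy:.4f}")  # close to +1: strong linear relationship
assert np.isclose(rho_xy, np.corrcoef(toy_x, toy_y)[0, 1])  # matches NumPy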
# Extract all traffic index columns
min_traffic = df['minimum_traffic_index'].values
max_traffic = df['maximum_traffic_index'].values
avg_traffic = df['average_traffic_index'].values
# Calculate means
mu_min = np.mean(min_traffic)
mu_max = np.mean(max_traffic)
mu_avg = np.mean(avg_traffic)
# Calculate variances (spread of each variable)
var_min = np.var(min_traffic, ddof=0)
var_max = np.var(max_traffic, ddof=0)
var_avg = np.var(avg_traffic, ddof=0)
# Calculate covariance using expectation: Cov(x,y) = <(x-μ_x)(y-μ_y)>
cov_min_max = np.mean((min_traffic - mu_min) * (max_traffic - mu_max))
cov_min_avg = np.mean((min_traffic - mu_min) * (avg_traffic - mu_avg))
cov_max_avg = np.mean((max_traffic - mu_max) * (avg_traffic - mu_avg))
# Using numpy.cov (returns the full covariance matrix)
cov_matrix = np.cov([min_traffic, max_traffic, avg_traffic])
# cov_matrix[i, j] = covariance between variables i and j; the diagonal holds the variances
# Note: np.cov uses ddof=1 (sample covariance) by default, so its entries
# differ slightly from the manual population covariances above
# Calculate standard deviations (np.std defaults to ddof=0, i.e. population,
# consistent with the manual covariances above)
sigma_min = np.std(min_traffic)
sigma_max = np.std(max_traffic)
sigma_avg = np.std(avg_traffic)
# Calculate correlation coefficients: ρ(x,y) = Cov(x,y) / (σ_x * σ_y)
rho_min_max = cov_min_max / (sigma_min * sigma_max)
rho_min_avg = cov_min_avg / (sigma_min * sigma_avg)
rho_max_avg = cov_max_avg / (sigma_max * sigma_avg)
print("=" * 60)
print("VARIANCE AND COVARIANCE")
print("=" * 60)
print("Variances (σ²):")
print(f" Minimum: {var_min:.4f}")
print(f" Maximum: {var_max:.4f}")
print(f" Average: {var_avg:.4f}")
print(f"\nCovariances:")
print(f" Cov(min, max): {cov_min_max:.4f}")
print(f" Cov(min, avg): {cov_min_avg:.4f}")
print(f" Cov(max, avg): {cov_max_avg:.4f}")
print(f"\nCorrelation coefficients ρ:")
print(f" ρ(min, max): {rho_min_max:.4f}")
print(f" ρ(min, avg): {rho_min_avg:.4f}")
print(f" ρ(max, avg): {rho_max_avg:.4f}")
print("=" * 60)
# Visualize covariance matrix
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Plot 1: Covariance matrix heatmap
im = axes[0].imshow(cov_matrix, cmap='coolwarm', aspect='auto')
axes[0].set_xticks([0, 1, 2])
axes[0].set_yticks([0, 1, 2])
axes[0].set_xticklabels(['Min', 'Max', 'Avg'])
axes[0].set_yticklabels(['Min', 'Max', 'Avg'])
axes[0].set_title('Covariance Matrix')
for i in range(3):
    for j in range(3):
        axes[0].text(j, i, f'{cov_matrix[i, j]:.1f}',
                     ha="center", va="center", color="black", fontweight='bold')
plt.colorbar(im, ax=axes[0])
# Plot 2: Correlation matrix heatmap
corr_matrix = np.array([[1.0,         rho_min_max, rho_min_avg],
                        [rho_min_max, 1.0,         rho_max_avg],
                        [rho_min_avg, rho_max_avg, 1.0]])
im2 = axes[1].imshow(corr_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
axes[1].set_xticks([0, 1, 2])
axes[1].set_yticks([0, 1, 2])
axes[1].set_xticklabels(['Min', 'Max', 'Avg'])
axes[1].set_yticklabels(['Min', 'Max', 'Avg'])
axes[1].set_title('Correlation Matrix')
for i in range(3):
    for j in range(3):
        axes[1].text(j, i, f'{corr_matrix[i, j]:.2f}',
                     ha="center", va="center", color="black", fontweight='bold')
plt.colorbar(im2, ax=axes[1])
plt.tight_layout()
plt.show()
# Visualize correlations
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
axes[0].scatter(min_traffic, max_traffic, alpha=0.5, s=10)
axes[0].set_xlabel('Minimum Traffic Index')
axes[0].set_ylabel('Maximum Traffic Index')
axes[0].set_title(f'ρ = {rho_min_max:.3f}')
axes[0].grid(True, alpha=0.3)
axes[1].scatter(min_traffic, avg_traffic, alpha=0.5, s=10, color='green')
axes[1].set_xlabel('Minimum Traffic Index')
axes[1].set_ylabel('Average Traffic Index')
axes[1].set_title(f'ρ = {rho_min_avg:.3f}')
axes[1].grid(True, alpha=0.3)
axes[2].scatter(max_traffic, avg_traffic, alpha=0.5, s=10, color='red')
axes[2].set_xlabel('Maximum Traffic Index')
axes[2].set_ylabel('Average Traffic Index')
axes[2].set_title(f'ρ = {rho_max_avg:.3f}')
axes[2].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
============================================================
VARIANCE AND COVARIANCE
============================================================
Variances (σ²):
  Minimum: 7.5693
  Maximum: 243.7025
  Average: 68.0846

Covariances:
  Cov(min, max): 4.3751
  Cov(min, avg): 6.6368
  Cov(max, avg): 116.3614

Correlation coefficients ρ:
  ρ(min, max): 0.1019
  ρ(min, avg): 0.2924
  ρ(max, avg): 0.9033
============================================================
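As a cross-check, np.corrcoef computes the same correlation matrix in one call (assuming the arrays and ρ values from the cell above):
# Cross-check the manual correlation coefficients against np.corrcoef
corr_np = np.corrcoef([min_traffic, max_traffic, avg_traffic])
assert np.isclose(corr_np[0, 1], rho_min_max)
assert np.isclose(corr_np[0, 2], rho_min_avg)
assert np.isclose(corr_np[1, 2], rho_max_avg)
print(np.round(corr_np, 4))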
7. Distribution Characteristics¶
Check whether the distribution is long-tailed or multi-modal.
# Check for multi-modal distribution using kernel density estimation
from scipy.stats import gaussian_kde
# Ensure required variables exist (in case cells are run out of order)
if 'npts' not in globals():
    npts = len(x)
if 'mu_fit' not in globals() or 'sigma_fit' not in globals():
    mu_fit, sigma_fit = norm.fit(x)
# Kernel density estimation (smooth density function)
kde = gaussian_kde(x)
x_kde = np.linspace(np.min(x), np.max(x), 200)
density = kde(x_kde)
# Find peaks in the density
from scipy.signal import find_peaks
peaks, properties = find_peaks(density, height=0.001)
# Check for a long tail by comparing the KDE against the fitted Gaussian
x_range = np.linspace(np.min(x), np.max(x), 100)
gaussian_curve = norm.pdf(x_range, mu_fit, sigma_fit)
data_tail = kde(x_range)  # KDE evaluated at the same points
plt.figure(figsize=(14, 5))
# Plot 1: KDE with peaks
plt.subplot(1, 2, 1)
plt.hist(x, bins=npts//50, density=True, alpha=0.5, color='lightblue', label='Histogram')
plt.plot(x_kde, density, 'b-', linewidth=2, label='Kernel Density Estimation')
if len(peaks) > 0:
    plt.plot(x_kde[peaks], density[peaks], 'ro', markersize=8, label=f'{len(peaks)} peaks found')
plt.xlabel('Average Traffic Index')
plt.ylabel('Density')
plt.title('Kernel Density Estimation (Multi-modal Check)')
plt.legend()
plt.grid(True, alpha=0.3)
# Plot 2: Long-tail check (log scale)
plt.subplot(1, 2, 2)
plt.hist(x, bins=npts//50, density=True, alpha=0.5, color='lightblue', label='Data histogram')
plt.plot(x_range, gaussian_curve, 'r-', linewidth=2, label='Gaussian fit')
plt.plot(x_range, data_tail, 'g--', linewidth=2, alpha=0.7, label='KDE (data)')
plt.xlabel('Average Traffic Index')
plt.ylabel('Density (log scale)')
plt.title('Long-tail Distribution Check')
plt.yscale('log')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Number of peaks found: {len(peaks)}")
if len(peaks) > 1:
    print("→ Multi-modal distribution detected!")
else:
    print("→ Unimodal distribution")
# Check skewness and excess kurtosis
skewness = stats.skew(x)
kurtosis = stats.kurtosis(x)  # Fisher definition: excess kurtosis, normal = 0
print(f"\nSkewness: {skewness:.4f} (positive = right tail, negative = left tail)")
print(f"Excess kurtosis: {kurtosis:.4f} (normal = 0, positive = heavy tails, negative = light tails)")
if abs(skewness) > 0.5:
    print(f"  → Distribution is {'right' if skewness > 0 else 'left'}-skewed")
if abs(kurtosis) > 0.5:
    print(f"  → Distribution has {'heavy' if kurtosis > 0 else 'light'} tails compared to normal")
Number of peaks found: 1
→ Unimodal distribution

Skewness: -0.6759 (positive = right tail, negative = left tail)
Excess kurtosis: 0.6002 (normal = 0, positive = heavy tails, negative = light tails)
  → Distribution is left-skewed
  → Distribution has heavy tails compared to normal
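One concrete way to see the asymmetric tails: compare the empirical mass beyond μ ± 2σ with the ≈2.28% a Gaussian puts in each tail (a sketch assuming x, mean_x, stddev_x from earlier cells):
# Empirical vs Gaussian tail mass beyond μ ± 2σ
low, high = mean_x - 2 * stddev_x, mean_x + 2 * stddev_x
left_tail = np.mean(x < low)  # empirical P(x < μ - 2σ)
right_tail = np.mean(x > high)  # empirical P(x > μ + 2σ)
print(f"Empirical left tail: {left_tail:.4f}")
print(f"Empirical right tail: {right_tail:.4f}")
print(f"Gaussian tail (each side): {norm.sf(2):.4f}")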
8. Central Limit Theorem Demonstration¶
Show how the distribution of sample means converges to a Gaussian as sample size increases.
# Central Limit Theorem: sample means approach Gaussian
# CLT states: as N increases, distribution of sample means → N(μ, σ²/N)
# Ensure required variables exist (in case cells are run out of order)
if 'mean_x' not in globals() or 'stddev_x' not in globals():
    mean_x = np.mean(x)
    stddev_x = np.std(x, ddof=0)
trials = 100
max_sample_size = min(500, len(x)//2)  # cap the largest sample size at half the dataset
sample_sizes = np.arange(10, max_sample_size, 30)
means = np.zeros((trials, len(sample_sizes)))
# Loop over sample sizes
for i, n_samples in enumerate(sample_sizes):
    # Loop over trials
    for trial in range(trials):
        # Draw a random sample from the data (with replacement)
        sample = np.random.choice(x, size=n_samples, replace=True)
        means[trial, i] = np.mean(sample)
# Calculate theoretical standard deviation of mean: σ_mean = σ / √N
theoretical_std = stddev_x / np.sqrt(sample_sizes)
# Plot
plt.figure(figsize=(12, 6))
plt.plot(sample_sizes, mean_x + theoretical_std, 'r--', linewidth=2, label='μ ± σ/√N (theoretical)')
plt.plot(sample_sizes, mean_x - theoretical_std, 'r--', linewidth=2)
# Plot estimated means and stddevs
mean_of_means = np.mean(means, axis=0)
stddev_of_means = np.std(means, axis=0)
plt.errorbar(sample_sizes, mean_of_means, yerr=stddev_of_means,
             fmt='ko', capsize=5, markersize=6, label='Estimated mean ± std')
# Plot individual points (sample for visibility)
for i, n_samples in enumerate(sample_sizes):
    sample_indices = np.random.choice(trials, size=min(20, trials), replace=False)
    plt.plot(np.full(len(sample_indices), n_samples), means[sample_indices, i],
             'o', markersize=2, alpha=0.2, color='gray')
plt.xlabel('Number of Samples (N)')
plt.ylabel('Mean Estimates')
plt.title('Central Limit Theorem: Distribution of Sample Means')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print("Central Limit Theorem:")
print(" As sample size N increases, the distribution of sample means")
print(" converges to a Gaussian distribution with:")
print(f" Mean: μ = {mean_x:.4f}")
print(f" Std: σ/√N (decreases as 1/√N)")
print(f" Error bars show: estimated std dev of means ≈ σ/√N")
Central Limit Theorem:
As sample size N increases, the distribution of sample means
converges to a Gaussian distribution with:
Mean: μ = 27.8308
Std: σ/√N (decreases as 1/√N)
Error bars show: estimated std dev of means ≈ σ/√N
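To see the Gaussian shape directly, rather than only the shrinking error bars, here is a sketch that histograms the sample means for one fixed N against the predicted N(μ, σ²/N) curve (assuming x, mean_x, stddev_x from earlier cells):
# Histogram of sample means for fixed N, against the CLT prediction N(μ, σ²/N)
N = 100
sample_means = np.array([np.mean(np.random.choice(x, size=N, replace=True))
                         for _ in range(2000)])
grid = np.linspace(sample_means.min(), sample_means.max(), 200)
plt.figure(figsize=(8, 4))
plt.hist(sample_means, bins=40, density=True, alpha=0.6, color='skyblue',
         edgecolor='black', label=f'Sample means (N={N})')
plt.plot(grid, norm.pdf(grid, mean_x, stddev_x / np.sqrt(N)), 'r-',
         linewidth=2, label='CLT prediction N(μ, σ²/N)')
plt.xlabel('Sample Mean')
plt.ylabel('Density')
plt.title('Sample Means vs CLT Prediction')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()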
9. Summary¶
Key Findings:¶
Distribution: Unimodal, left-skewed, non-normal
Statistics:
- Mean: 27.83, Std: 8.25, Variance: 68.08
Normality: Data does not follow Gaussian distribution (p < 0.05)
Correlations:
- Strong: max ↔ avg (ρ = 0.90)
- Moderate: min ↔ avg (ρ = 0.29)
- Weak: min ↔ max (ρ = 0.10)
Characteristics: Left-skewed (-0.68), heavy-tailed (excess kurtosis = 0.60)
CLT: Sample means converge to Gaussian as N increases