Data Science - Week 1: Tools¶
Student: Sedat Yalcin
Overview: Python Scientific Stack¶
Core Libraries:
- NumPy: Array operations, linear algebra
- SciPy: Scientific algorithms built on NumPy
- pandas: Tabular data and time series handling
- Matplotlib: Static plotting
- scikit-learn: Machine learning
- Altair: Declarative, interactive visualization
Applying to Istanbul Traffic Data¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import signal, stats, interpolate, optimize
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
NumPy version: 2.3.3
Pandas version: 2.3.3
df = pd.read_csv('datasets/traffic_index.csv')
df['trafficindexdate'] = pd.to_datetime(df['trafficindexdate'], format='mixed')
df = df.sort_values('trafficindexdate').reset_index(drop=True)
print(f"Loaded {len(df)} observations")
Loaded 3332 observations
1. NumPy Arrays: Vectorized Operations¶
# Convert to NumPy array
traffic_array = df['average_traffic_index'].values
print("NumPy Array Properties:")
print(f"Shape: {traffic_array.shape}")
print(f"Data type: {traffic_array.dtype}")
print(f"Memory: {traffic_array.nbytes / 1024:.2f} KB")
print(f"\nFirst 10 values: {traffic_array[:10]}")
# Statistical operations (vectorized)
print(f"\nVectorized Statistics:")
print(f"Mean: {np.mean(traffic_array):.2f}")
print(f"Std: {np.std(traffic_array):.2f}")
print(f"Median: {np.median(traffic_array):.2f}")
print(f"95th percentile: {np.percentile(traffic_array, 95):.2f}")
NumPy Array Properties:
Shape: (3332,)
Data type: float64
Memory: 26.03 KB

First 10 values: [57.85811578 23.7704918 38.60126582 29.71527778 28.55749129 32.3 24.18402778 10.59027778 25.7456446 31.42708333]

Vectorized Statistics:
Mean: 27.83
Std: 8.25
Median: 28.98
95th percentile: 39.15
2. Signal Processing: Smoothing Noisy Traffic Data¶
# Get recent data
recent = df[df['trafficindexdate'] >= '2024-01-01'].copy()
recent = recent.set_index('trafficindexdate')['average_traffic_index']
# Different smoothing methods
window = 7
moving_avg = recent.rolling(window=window, center=True).mean()
# Savitzky-Golay filter (fits a local polynomial; window_length must be odd)
savgol = signal.savgol_filter(recent.values, window_length=21, polyorder=3)
# Plot comparison
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(recent.index, recent.values, 'o', alpha=0.3, markersize=3, label='Raw data', color='lightgray')
ax.plot(moving_avg.index, moving_avg.values, linewidth=2, label=f'{window}-day Moving Avg', color='#4472C4')
ax.plot(recent.index, savgol, linewidth=2, label='Savitzky-Golay', color='#ED7D31')
ax.set_title('Signal Processing: Smoothing Noisy Traffic Data', fontsize=13, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Traffic Index')
ax.legend(loc='upper left', frameon=False)
ax.grid(True, alpha=0.3)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()
plt.show()
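A third common smoothing option, not shown above, is exponential weighting, where recent observations count more than older ones. A minimal sketch, assuming the recent series from the cell above:

# Exponentially weighted moving average (pandas built-in)
# span=7 makes it roughly comparable to the 7-day moving average,
# but it reacts faster to sudden changes in the signal
ewma = recent.ewm(span=7).mean()
print(ewma.tail())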
3. Fourier Transform: Finding Periodic Patterns¶
# Use full dataset for frequency analysis
traffic_signal = df['average_traffic_index'].values
N = len(traffic_signal)
# FFT
fft = np.fft.fft(traffic_signal)
freq = np.fft.fftfreq(N, d=1) # d=1 means daily sampling
# Power spectrum
power = np.abs(fft)**2
# Find dominant frequencies (skip DC component at freq=0)
mask = freq > 0
freq_positive = freq[mask]
power_positive = power[mask]
# Convert to periods (days)
periods = 1 / freq_positive
# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
# Focus on weekly/monthly patterns
mask_range = (periods >= 3) & (periods <= 60)
ax1.plot(periods[mask_range], power_positive[mask_range], linewidth=1.5, color='#4472C4')
ax1.axvline(x=7, color='#ED7D31', linestyle='--', linewidth=2, label='Weekly (7 days)')
ax1.axvline(x=30, color='#70AD47', linestyle='--', linewidth=2, label='Monthly (30 days)')
ax1.set_xlabel('Period (days)')
ax1.set_ylabel('Power')
ax1.set_title('Frequency Analysis: Dominant Patterns', fontsize=12, fontweight='bold')
ax1.legend(frameon=False)
ax1.grid(True, alpha=0.3)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
# Find peaks
peaks, properties = signal.find_peaks(power_positive[mask_range], height=np.max(power_positive[mask_range])*0.1)
peak_periods = periods[mask_range][peaks]
peak_powers = power_positive[mask_range][peaks]
# Show top 5 periods
top_indices = np.argsort(peak_powers)[-5:]  # at most 5; fewer if find_peaks returned fewer
top_periods = peak_periods[top_indices]
top_powers = peak_powers[top_indices]
ax2.barh(range(len(top_periods)), top_powers, color='#4472C4')
ax2.set_yticks(range(len(top_periods)))
ax2.set_yticklabels([f'{p:.1f} days' for p in top_periods])
ax2.set_xlabel('Power')
ax2.set_title('Top 5 Periodic Patterns', fontsize=12, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='x')
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)
plt.tight_layout()
plt.show()
print("Detected Periods:")
for period, pwr in zip(top_periods, top_powers):  # avoid shadowing the power array
    print(f"  {period:.1f} days (power: {pwr:.0f})")
Detected Periods:
  3.5 days (power: 4098257)
  3.5 days (power: 11664688)
  7.0 days (power: 33844124)
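Since the traffic signal is real-valued, np.fft.rfft computes only the non-negative frequencies, which halves the work and makes the positive-frequency mask above unnecessary. A minimal equivalent sketch, reusing traffic_signal and N:

# rfft returns only non-negative frequencies for real input
rfft_vals = np.fft.rfft(traffic_signal)
rfreq = np.fft.rfftfreq(N, d=1)
rpower = np.abs(rfft_vals)**2
# skip the DC component at index 0 before converting to periods
rperiods = 1 / rfreq[1:]
print(f"Dominant period: {rperiods[np.argmax(rpower[1:])]:.1f} days")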
4. Interpolation: Filling Gaps¶
# Create a sparse sample with gaps
sample = df.iloc[::10].copy()  # keep every 10th observation to simulate missing data
x = np.arange(len(sample))
y = sample['average_traffic_index'].values
# Dense grid for interpolation
x_dense = np.linspace(0, len(sample)-1, len(sample)*10)
# Different interpolation methods
f_linear = interpolate.interp1d(x, y, kind='linear')
f_cubic = interpolate.interp1d(x, y, kind='cubic')
f_spline = interpolate.UnivariateSpline(x, y, s=10)  # s>0 smooths rather than interpolating exactly
# Plot
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(x, y, 'o', markersize=8, label='Original data', color='black', zorder=5)
ax.plot(x_dense, f_linear(x_dense), '--', linewidth=2, label='Linear', alpha=0.7, color='#636363')
ax.plot(x_dense, f_cubic(x_dense), linewidth=2, label='Cubic', color='#4472C4')
ax.plot(x_dense, f_spline(x_dense), linewidth=2, label='Spline', color='#ED7D31')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Traffic Index')
ax.set_title('Interpolation Methods Comparison', fontsize=13, fontweight='bold')
ax.legend(frameon=False)
ax.grid(True, alpha=0.3)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()
plt.show()
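Note that interp1d is marked as legacy in recent SciPy releases; CubicSpline (or make_interp_spline) is the recommended replacement and is a drop-in substitute here. A short sketch using the same x, y, and x_dense:

# Modern replacement for interp1d(kind='cubic')
from scipy.interpolate import CubicSpline
cs = CubicSpline(x, y)
y_cs = cs(x_dense)  # evaluate the spline on the dense grid
print(f"Max difference vs interp1d cubic: {np.max(np.abs(y_cs - f_cubic(x_dense))):.4f}")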
5. Statistical Tests: Hypothesis Testing¶
# Test: Are weekday and weekend traffic distributions different?
weekday_traffic = df[df['trafficindexdate'].dt.dayofweek < 5]['average_traffic_index']
weekend_traffic = df[df['trafficindexdate'].dt.dayofweek >= 5]['average_traffic_index']
# T-test
t_stat, p_value = stats.ttest_ind(weekday_traffic, weekend_traffic)
# Mann-Whitney U test (non-parametric)
u_stat, p_value_mw = stats.mannwhitneyu(weekday_traffic, weekend_traffic)
# Kolmogorov-Smirnov test (distribution comparison)
ks_stat, p_value_ks = stats.ks_2samp(weekday_traffic, weekend_traffic)
print("Statistical Tests: Weekday vs Weekend Traffic\n")
print(f"Weekday mean: {weekday_traffic.mean():.2f}")
print(f"Weekend mean: {weekend_traffic.mean():.2f}")
print(f"Difference: {weekday_traffic.mean() - weekend_traffic.mean():.2f}\n")
print("T-test (parametric):")
print(f" t-statistic: {t_stat:.4f}")
print(f" p-value: {p_value:.2e}")
print(f" Significant: {'Yes' if p_value < 0.05 else 'No'}\n")
print("Mann-Whitney U (non-parametric):")
print(f" U-statistic: {u_stat:.0f}")
print(f" p-value: {p_value_mw:.2e}")
print(f" Significant: {'Yes' if p_value_mw < 0.05 else 'No'}\n")
print("Kolmogorov-Smirnov:")
print(f" KS-statistic: {ks_stat:.4f}")
print(f" p-value: {p_value_ks:.2e}")
print(f" Distributions different: {'Yes' if p_value_ks < 0.05 else 'No'}")
Statistical Tests: Weekday vs Weekend Traffic

Weekday mean: 30.18
Weekend mean: 21.94
Difference: 8.25

T-test (parametric):
  t-statistic: 29.1902
  p-value: 5.48e-167
  Significant: Yes

Mann-Whitney U (non-parametric):
  U-statistic: 1796490
  p-value: 1.26e-154
  Significant: Yes

Kolmogorov-Smirnov:
  KS-statistic: 0.4844
  p-value: 1.70e-145
  Distributions different: Yes
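With samples this large, even a small gap would reach significance, so an effect size helps put the roughly 8-point difference in context. A sketch of Cohen's d with a pooled standard deviation (not part of the original analysis):

# Cohen's d: mean difference scaled by the pooled standard deviation
n1, n2 = len(weekday_traffic), len(weekend_traffic)
pooled_std = np.sqrt(((n1 - 1) * weekday_traffic.std()**2 +
                      (n2 - 1) * weekend_traffic.std()**2) / (n1 + n2 - 2))
cohens_d = (weekday_traffic.mean() - weekend_traffic.mean()) / pooled_std
print(f"Cohen's d: {cohens_d:.2f}")  # ~0.8 or above is conventionally a large effect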
6. Optimization: Finding Peaks¶
# Find peak traffic days using optimization
recent_2024 = df[df['trafficindexdate'] >= '2024-01-01'].copy()
recent_2024['day_num'] = (recent_2024['trafficindexdate'] - recent_2024['trafficindexdate'].min()).dt.days
x = recent_2024['day_num'].values
y = recent_2024['average_traffic_index'].values
# Fit polynomial to find overall trend
def polynomial(x, a, b, c, d):
    return a*x**3 + b*x**2 + c*x + d
# Fit using curve_fit
params, _ = optimize.curve_fit(polynomial, x, y, p0=[0, 0, 0, np.mean(y)])
y_fit = polynomial(x, *params)
# Find peaks in residuals
residuals = y - y_fit
peaks, properties = signal.find_peaks(residuals, prominence=5, distance=7)
# Plot
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))
# Original with trend
ax1.scatter(x, y, alpha=0.3, s=20, color='#d3d3d3', label='Daily data')
ax1.plot(x, y_fit, '-', linewidth=2, label='Polynomial fit', color='#ED7D31')
ax1.set_ylabel('Traffic Index')
ax1.set_title('Traffic Pattern with Polynomial Trend', fontsize=12, fontweight='bold')
ax1.legend(frameon=False)
ax1.grid(True, alpha=0.3)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
# Residuals with peaks
ax2.plot(x, residuals, alpha=0.7, color='#636363', linewidth=1)
ax2.plot(x[peaks], residuals[peaks], 'o', markersize=8, color='#ED7D31', label=f'{len(peaks)} peak days')
ax2.axhline(y=0, color='black', linestyle='--', linewidth=1, alpha=0.5)
ax2.set_xlabel('Days since Jan 1, 2024')
ax2.set_ylabel('Residual (Deviation from trend)')
ax2.set_title('Peak Traffic Days Detection', fontsize=12, fontweight='bold')
ax2.legend(frameon=False)
ax2.grid(True, alpha=0.3)
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)
plt.tight_layout()
plt.show()
# Show peak dates
peak_dates = recent_2024.iloc[peaks]['trafficindexdate']
print(f"\nDetected {len(peaks)} peak traffic days in 2024:")
for date, residual in zip(peak_dates.head(10), residuals[peaks][:10]):
    print(f"  {date.strftime('%Y-%m-%d %A')}: +{residual:.1f} above trend")
Detected 28 peak traffic days in 2024:
  2024-01-09 Tuesday: +12.3 above trend
  2024-01-20 Saturday: +5.3 above trend
  2024-01-29 Monday: +9.0 above trend
  2024-02-15 Thursday: +13.4 above trend
  2024-02-29 Thursday: +4.1 above trend
  2024-03-08 Friday: +16.2 above trend
  2024-03-20 Wednesday: +5.1 above trend
  2024-03-28 Thursday: +10.6 above trend
  2024-04-04 Thursday: +6.4 above trend
  2024-04-11 Thursday: +2.6 above trend
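Since the trend model is a plain cubic, np.polynomial.Polynomial.fit can do the same job without a hand-written model function; curve_fit earns its keep on models that are nonlinear in the parameters. A minimal equivalent sketch:

# Equivalent cubic fit without defining polynomial() by hand
p = np.polynomial.Polynomial.fit(x, y, deg=3)
y_fit_poly = p(x)
print(f"Max difference vs curve_fit: {np.max(np.abs(y_fit_poly - y_fit)):.4f}")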
7. Linear Algebra: Correlation Matrix¶
# Create feature matrix
features = df[['minimum_traffic_index', 'maximum_traffic_index', 'average_traffic_index']].values
# Correlation matrix using NumPy
corr_matrix = np.corrcoef(features.T)
# Plot
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(corr_matrix, cmap='RdBu_r', aspect='auto', vmin=-1, vmax=1)
ax.set_xticks([0, 1, 2])
ax.set_yticks([0, 1, 2])
ax.set_xticklabels(['Min', 'Max', 'Avg'])
ax.set_yticklabels(['Min', 'Max', 'Avg'])
# Add correlation values
for i in range(3):
    for j in range(3):
        ax.text(j, i, f'{corr_matrix[i, j]:.3f}',
                ha="center", va="center", color="black", fontsize=12, fontweight='bold')
ax.set_title('Correlation Matrix: Traffic Indices', fontsize=13, fontweight='bold', pad=15)
plt.colorbar(im, ax=ax, label='Correlation')
plt.tight_layout()
plt.show()
print("Correlation Analysis:")
print(f"Min-Max correlation: {corr_matrix[0,1]:.3f}")
print(f"Min-Avg correlation: {corr_matrix[0,2]:.3f}")
print(f"Max-Avg correlation: {corr_matrix[1,2]:.3f}")
Correlation Analysis:
Min-Max correlation: 0.102
Min-Avg correlation: 0.292
Max-Avg correlation: 0.903
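Under the hood, a correlation matrix is standardize-then-multiply; a short sketch reproducing np.corrcoef with explicit linear algebra, reusing features and corr_matrix from above:

# Standardize each column, then correlation = Z.T @ Z / n
Z = (features - features.mean(axis=0)) / features.std(axis=0)
corr_manual = Z.T @ Z / len(Z)
print(np.allclose(corr_manual, corr_matrix))  # True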
8. Performance Comparison: NumPy vs Pure Python¶
import time
# Test data
data = df['average_traffic_index'].values
# Pure Python (time.perf_counter has higher resolution than time.time)
start = time.perf_counter()
python_mean = sum(data) / len(data)
python_std = (sum((x - python_mean)**2 for x in data) / len(data))**0.5
python_time = time.perf_counter() - start
# NumPy
start = time.perf_counter()
numpy_mean = np.mean(data)
numpy_std = np.std(data)
numpy_time = time.perf_counter() - start
# Results
print("Performance Comparison: Pure Python vs NumPy\n")
print(f"Dataset size: {len(data):,} values\n")
print("Pure Python:")
print(f" Time: {python_time*1000:.3f} ms")
print(f" Mean: {python_mean:.6f}")
print(f" Std: {python_std:.6f}\n")
print("NumPy (vectorized):")
print(f" Time: {numpy_time*1000:.3f} ms")
print(f" Mean: {numpy_mean:.6f}")
print(f" Std: {numpy_std:.6f}\n")
print(f"Speedup: {python_time/numpy_time:.1f}x faster")
# Visualization
fig, ax = plt.subplots(figsize=(10, 6))
methods = ['Pure Python', 'NumPy']
times = [python_time*1000, numpy_time*1000]
colors = ['#d3d3d3', '#70AD47']
bars = ax.barh(methods, times, color=colors)
ax.set_xlabel('Time (milliseconds)', fontsize=11)
ax.set_title(f'NumPy is {python_time/numpy_time:.1f}x Faster for Array Operations',
fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Add time labels
for bar, t in zip(bars, times):  # 'time' would shadow the time module
    ax.text(bar.get_width() + 0.1, bar.get_y() + bar.get_height()/2,
            f'{t:.3f} ms', va='center', fontsize=10)
plt.tight_layout()
plt.show()
Performance Comparison: Pure Python vs NumPy

Dataset size: 3,332 values

Pure Python:
  Time: 1.797 ms
  Mean: 27.830846
  Std: 8.251340

NumPy (vectorized):
  Time: 0.299 ms
  Mean: 27.830846
  Std: 8.251340

Speedup: 6.0x faster
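Single wall-clock measurements this short are noisy; the timeit module repeats the run, and the minimum is the usual headline number. A sketch assuming the same data array as above:

import timeit
# Best-of-5, 100 calls each, to smooth out scheduler noise
py_t = min(timeit.repeat(lambda: sum(data) / len(data), number=100, repeat=5)) / 100
np_t = min(timeit.repeat(lambda: np.mean(data), number=100, repeat=5)) / 100
print(f"Pure Python mean: {py_t*1e6:.1f} us/call, NumPy mean: {np_t*1e6:.1f} us/call")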
9. Machine Learning: Scaling, PCA, and Clustering¶
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Feature scaling
features = df[['minimum_traffic_index', 'maximum_traffic_index', 'average_traffic_index']].values
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
print("StandardScaler: mean=0, std=1")
print(f"Before: mean={features.mean(axis=0)[0]:.2f}, std={features.std(axis=0)[0]:.2f}")
print(f"After: mean={features_scaled.mean(axis=0)[0]:.2f}, std={features_scaled.std(axis=0)[0]:.2f}")
StandardScaler: mean=0, std=1
Before: mean=2.08, std=2.75
After: mean=-0.00, std=1.00
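StandardScaler here is just a column-wise z-score; a one-line check against plain NumPy (StandardScaler uses the population standard deviation, ddof=0):

# Column-wise z-scoring reproduces StandardScaler exactly
manual_scaled = (features - features.mean(axis=0)) / features.std(axis=0)
print(np.allclose(manual_scaled, features_scaled))  # True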
# PCA: Reduce 3D to 2D
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_scaled)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# PCA visualization
scatter = ax1.scatter(features_pca[:, 0], features_pca[:, 1],
c=df['average_traffic_index'], cmap='viridis', alpha=0.5, s=10)
ax1.set_xlabel('PC1')
ax1.set_ylabel('PC2')
ax1.set_title('PCA: 3 features → 2 dimensions', fontweight='bold')
plt.colorbar(scatter, ax=ax1, label='Avg Traffic')
# Variance explained
ax2.bar([1, 2], pca.explained_variance_ratio_, color='#4472C4')
ax2.set_xlabel('Component')
ax2.set_ylabel('Variance Explained')
ax2.set_title(f'Total: {pca.explained_variance_ratio_.sum():.1%} variance explained', fontweight='bold')
for i, v in enumerate(pca.explained_variance_ratio_):
    ax2.text(i+1, v+0.02, f'{v:.1%}', ha='center', fontweight='bold')
plt.tight_layout()
plt.show()
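To see what PC1 and PC2 actually measure, the loadings in pca.components_ map each component back to the original min/max/avg features. A short inspection sketch:

# Rows are components, columns are the original features
feature_names = ['min', 'max', 'avg']
for i, comp in enumerate(pca.components_):
    weights = ", ".join(f"{n}={w:+.2f}" for n, w in zip(feature_names, comp))
    print(f"PC{i+1}: {weights}")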
# K-means clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(features_pca)
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(features_pca[:, 0], features_pca[:, 1],
c=clusters, cmap='Set1', alpha=0.5, s=20)
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
marker='X', s=300, c='black', edgecolors='yellow', linewidth=2, label='Centroids')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('K-Means: 3 Traffic Patterns', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("\nCluster Statistics:")
for i in range(3):
    cluster_data = df[clusters == i]['average_traffic_index']
    print(f"Cluster {i}: n={len(cluster_data):4d}, mean={cluster_data.mean():.1f}, std={cluster_data.std():.1f}")
Cluster Statistics:
Cluster 0: n=2284, mean=30.9, std=4.7
Cluster 1: n= 768, mean=16.2, std=5.9
Cluster 2: n= 280, mean=34.7, std=6.2
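The choice of k=3 above is an assumption; the usual sanity check is an elbow plot of inertia (within-cluster sum of squares) over a range of k. A minimal sketch reusing features_pca:

# Fit K-Means for k = 1..8 and look for the bend in the curve
inertias = [KMeans(n_clusters=k, random_state=42, n_init=10).fit(features_pca).inertia_
            for k in range(1, 9)]
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(range(1, 9), inertias, 'o-', color='#4472C4')
ax.set_xlabel('k (number of clusters)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Choosing k', fontweight='bold')
plt.tight_layout()
plt.show()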
10. Interactive Visualization: Altair¶
import altair as alt
# Prepare data with year column
df_viz = df.copy()
df_viz['year'] = df_viz['trafficindexdate'].dt.year
df_viz['month'] = df_viz['trafficindexdate'].dt.month
monthly = df_viz.groupby(['year', 'month'])['average_traffic_index'].mean().reset_index()
monthly_recent = monthly[monthly['year'] >= 2022]
# Altair: Declarative style
chart = alt.Chart(monthly_recent).mark_line(point=True).encode(
x=alt.X('month:O', title='Month'),
y=alt.Y('average_traffic_index:Q', title='Traffic Index'),
color=alt.Color('year:N', legend=alt.Legend(title='Year')),
tooltip=['year:N', 'month:O', 'average_traffic_index:Q']
).properties(
width=600,
height=400,
title='Interactive: Hover to see values'
).interactive()
chart
Matplotlib vs Altair¶
- Matplotlib: Imperative (step-by-step)
- Altair: Declarative (describe what you want)
# Compare: Matplotlib vs Altair for same chart
import matplotlib.pyplot as plt
# MATPLOTLIB: ~15 lines
fig, ax = plt.subplots(figsize=(10, 6))
for year in monthly_recent['year'].unique():
    year_data = monthly_recent[monthly_recent['year'] == year]  # avoid reusing the name 'data'
    ax.plot(year_data['month'], year_data['average_traffic_index'],
            marker='o', label=str(year), linewidth=2)
ax.set_xlabel('Month')
ax.set_ylabel('Traffic Index')
ax.set_title('Traffic by Month and Year')
ax.legend(title='Year')
ax.grid(True, alpha=0.3)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()
plt.show()
print("Matplotlib: 15+ lines, imperative style")
print("Altair: 8 lines, declarative + interactive")
Matplotlib: 15+ lines, imperative style
Altair: 8 lines, declarative + interactive
# Layered chart: Bar + Line
yearly_data = df_viz.groupby('year').agg({
'average_traffic_index': 'mean',
'maximum_traffic_index': 'max'
}).reset_index()
base = alt.Chart(yearly_data)
bars = base.mark_bar(opacity=0.7, color='#4472C4').encode(
x=alt.X('year:O', title='Year'),
y=alt.Y('average_traffic_index:Q', title='Traffic Index')
)
line = base.mark_line(point=True, color='#ED7D31', strokeWidth=3).encode(
x='year:O',
y='maximum_traffic_index:Q'
)
# Combine layers
chart = (bars + line).properties(
width=600,
height=400,
title='Layered Chart: Average (bars) + Max (line)'
).configure_axis(
labelFontSize=11,
titleFontSize=12
)
chart
# Selection: Interactive filtering
df_recent = df_viz[df_viz['year'] >= 2020].copy()
brush = alt.selection_interval(encodings=['x'])
upper = alt.Chart(df_recent).mark_line().encode(
x=alt.X('trafficindexdate:T', title='Date'),
y=alt.Y('average_traffic_index:Q', title='Traffic Index'),
opacity=alt.condition(brush, alt.value(1.0), alt.value(0.3))
).properties(
width=600,
height=200,
title='Select a region to zoom'
).add_params(brush)
lower = alt.Chart(df_recent).mark_line().encode(
x=alt.X('trafficindexdate:T', title='Date', scale=alt.Scale(domain=brush)),
y=alt.Y('average_traffic_index:Q', title='Traffic Index')
).properties(
width=600,
height=200,
title='Zoomed view'
)
chart = alt.vconcat(upper, lower).configure_axis(
grid=False
)
chart
Altair Key Features:¶
- Declarative: Describe what, not how
- Interactive: Built-in tooltips, zoom, selection
- Composable: Layer, concatenate, repeat charts
- Grammar: Based on Vega-Lite (grammar of graphics)
When to use:
- Need interactivity without JavaScript
- Want clean, readable code
- Building dashboards or web apps
When not to use:
- Need pixel-perfect control
- Creating very complex custom visualizations
- Performance with huge datasets (>5000 points)
References¶
Books¶
Knaflic, C. N. (2015). Storytelling with Data: A Data Visualization Guide for Business Professionals. Wiley.
- Chapter 5: Think Like a Designer
- Chapter 9: Case Studies
Evergreen, S. D. (2019). Effective Data Visualization: The Right Chart for the Right Data (2nd ed.). SAGE Publications.
- Chapter 6: Powerful Visualizations in Four Shapes
Lo Duca, A. (2024). Data Storytelling with Generative AI: Using Python and Altair. Manning Publications.
- Chapter 7: From Information to Knowledge
Online Resources¶
- Matplotlib Documentation: https://matplotlib.org/
- Seaborn Tutorial: https://seaborn.pydata.org/tutorial.html
- Altair Documentation: https://altair-viz.github.io/
- NumPy User Guide: https://numpy.org/doc/stable/user/
- SciPy Documentation: https://docs.scipy.org/doc/scipy/
- Scikit-learn User Guide: https://scikit-learn.org/stable/user_guide.html
Data Source¶
İstanbul Büyükşehir Belediyesi (2024). İstanbul Trafik Endeksi. İBB Açık Veri Portalı. https://data.ibb.gov.tr/dataset/istanbul-trafik-indeksi
AI Assistance: Claude AI (Anthropic) - Code generation, visualization examples, and analysis structure