[karma Tshomo] - Fab Futures - Data Science
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load your dataset
df = pd.read_csv("datasets/Housing.csv")
df.head()
Out[1]:
price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating airconditioning parking prefarea furnishingstatus
0 13300000 7420 4 2 3 yes no no no yes 2 yes furnished
1 12250000 8960 4 4 4 yes no no no yes 3 no furnished
2 12250000 9960 3 2 2 yes no yes no no 2 yes semi-furnished
3 12215000 7500 4 2 2 yes no yes no yes 3 yes furnished
4 11410000 7420 4 1 2 yes yes yes no yes 2 no furnished
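A quick sanity check on what was loaded can help before any modelling; a minimal sketch, assuming df was read in as in the cell above:

In [ ]:
# Sanity check on the loaded frame (assumes df from the cell above)
print(df.shape)                  # (rows, columns)
print(df.dtypes)                 # numeric vs. object (categorical) columns
print(df["price"].describe())    # summary statistics of the target variable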
In [2]:
# Extract the target variable (price)
x = df["price"].values

# Compute mean and standard deviation of your data
mean = np.mean(x)
stddev = np.std(x)

print("Mean price:", mean)
print("Std Dev:", stddev)

# Plot histogram and data points
plt.figure(figsize=(10,5))
plt.hist(x, bins=len(x)//50, density=True, alpha=0.6)

# Data point markers
plt.plot(x, 0*x, '|', ms=len(x)/20)

# Plot Gaussian PDF using estimated mean and stddev
xi = np.linspace(mean - 3*stddev, mean + 3*stddev, 200)
yi = np.exp(-(xi-mean)**2/(2*stddev**2)) / np.sqrt(2*np.pi*stddev**2)

plt.plot(xi, yi, 'r')
plt.xlabel("Price")
plt.ylabel("Probability Density")
plt.title("Gaussian Modeling of Housing Prices")
plt.show()
Mean price: 4766729.247706422
Std Dev: 1868722.8281312082
[Figure: Gaussian Modeling of Housing Prices]
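For reference, the red curve computed above is the normal probability density evaluated with the sample mean and standard deviation, which is what the line computing yi implements:

$$
f(x) = \frac{1}{\sqrt{2\pi\sigma^{2}}}\,\exp\!\left(-\frac{(x-\mu)^{2}}{2\sigma^{2}}\right)
$$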
In [ ]:
Interpretation:

The histogram shows the actual distribution of housing prices.

The red curve represents a Gaussian distribution fitted using the sample mean and standard deviation.

Housing prices roughly follow a normal pattern, with some variation.

Extremely high and low prices cause small deviations from the Gaussian curve (see the skewness check below).
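The skewness and excess kurtosis of the prices put a number on that deviation; both would be close to 0 for a true Gaussian. A minimal sketch using pandas, assuming df is loaded as above:

In [ ]:
# Quantify the deviation from normality noted above (assumes df from the first cell)
# A perfect Gaussian would give skewness ~ 0 and excess kurtosis ~ 0.
print("Skewness:", df["price"].skew())          # > 0 suggests a longer right tail (expensive outliers)
print("Excess kurtosis:", df["price"].kurt())   # > 0 suggests heavier tails than a Gaussian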
In [3]:
data = df["price"].values

# True mean and width (stddev) of your dataset
true_mean = np.mean(data)
true_width = np.std(data)

trials = 100
points = np.arange(10, 500, 25)
means = np.zeros((trials, len(points)))

#
# Loop over number of points (samples)
#
for p in range(len(points)):
    N = points[p]

    # For each trial, randomly sample N prices from your dataset
    for trial in range(trials):
        sample = np.random.choice(data, size=N, replace=True)
        means[trial, p] = np.mean(sample)

#
# Plot calculated true_mean ± width / sqrt(N)
#
plt.figure(figsize=(12,6))
plt.plot(points, true_mean + true_width/np.sqrt(points), 'r', label='calculated')
plt.plot(points, true_mean - true_width/np.sqrt(points), 'r')

#
# Plot estimated mean and stddev across trials
#
estimated_mean = np.mean(means, axis=0)
estimated_stddev = np.std(means, axis=0)

plt.errorbar(points, estimated_mean, yerr=estimated_stddev,
             fmt='k-o', capsize=7, label='estimated')

#
# Plot individual trial points
#
for p in range(len(points)):
    plt.plot(np.full(trials, points[p]), means[:, p],
             'o', markersize=2)

plt.xlabel("Number of samples averaged")
plt.ylabel("Estimated mean price")
plt.title("Error Reduction by Averaging (Housing Price Data)")
plt.legend()
plt.show()
[Figure: Error Reduction by Averaging (Housing Price Data)]
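The red envelope in the plot follows the standard error of the mean: averaging N independent samples shrinks the spread of the estimate from the population standard deviation to

$$
\sigma_{\bar{x}} = \frac{\sigma}{\sqrt{N}}
$$

which is the true_width / np.sqrt(points) band drawn in the code above.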
In [ ]:
Interpretation:

Each point shows the mean price estimated from one random sample.

When the sample size is small, the estimates vary widely.

As the sample size increases, the estimates become more stable.

This confirms that larger samples reduce estimation error, as the comparison below checks against the sigma/sqrt(N) prediction.
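A minimal sketch of that comparison, reusing points, means, and true_width from the averaging cell above:

In [ ]:
# Compare the theoretical standard error with the observed spread of the trial means
# (reuses points, means, and true_width from the cell above)
theoretical_se = true_width / np.sqrt(points)
observed_se = np.std(means, axis=0)

for N, t, o in zip(points, theoretical_se, observed_se):
    print(f"N={N}: sigma/sqrt(N)={t:.0f}, observed std of means={o:.0f}")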
In [4]:
# Set seed and display options
np.random.seed(10)
np.set_printoptions(precision=2)
np.set_printoptions(suppress=True)

# Select two numerical columns: price and area
data = df[['price', 'area']].values

# Number of points
npts = data.shape[0]

# Compute mean and standard deviation for each column
varmean = np.mean(data, axis=0)
varstd = np.sqrt(np.var(data, axis=0))

# Coordinates for plotting variance lines
varplotx = [varmean[0]-varstd[0], varmean[0]+varstd[0], None, varmean[0], varmean[0]]
varploty = [varmean[1], varmean[1], None, varmean[1]-varstd[1], varmean[1]+varstd[1]]

# Compute covariance matrix
covar = np.cov(data, rowvar=False)
covarmean = np.mean(data, axis=0)

# Eigen decomposition
evalu, evect = np.linalg.eig(covar)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[0,1]*np.sqrt(evalu[1])
dy0 = evect[1,0]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])

# Coordinates for plotting the covariance (principal-axis) lines: mean +/- sqrt(eigenvalue) * eigenvector
covarplotx = [covarmean[0]-dx0, covarmean[0]+dx0, None, covarmean[0]-dx1, covarmean[0]+dx1]
covarploty = [covarmean[1]-dy0, covarmean[1]+dy0, None, covarmean[1]-dy1, covarmean[1]+dy1]

# Print covariance matrix
print("Covariance matrix:")
print(covar)

# Plot
plt.figure(figsize=(8,6))
plt.hist2d(data[:,0], data[:,1], bins=30, cmap='binary')
plt.plot(data[:,0], data[:,1], 'o', markersize=2, alpha=0.3)
plt.plot(varmean[0], varmean[1], 'ro')
plt.plot(covarmean[0], covarmean[1], 'ro')
plt.plot(varplotx, varploty, 'r')
plt.plot(covarplotx, covarploty, 'r')
plt.text(varmean[0]+0.5e6, varmean[1]-500, "variance", fontsize=12)
plt.text(covarmean[0]+0.5e6, covarmean[1]+500, "covariance", fontsize=12)
plt.axis('off')
plt.show()
Covariance matrix:
[[3.50e+12 2.18e+09]
 [2.18e+09 4.71e+06]]
[Figure: 2D histogram of price vs. area with variance and covariance (principal-axis) lines]
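The covariance lines are the principal axes of the joint distribution: for each eigenpair (lambda_i, v_i) of the covariance matrix, a segment is drawn through the mean with half-length sqrt(lambda_i) along v_i, which is what the dx/dy terms above compute:

$$
\Sigma v_i = \lambda_i v_i, \qquad \text{axis}_i:\; \mu \pm \sqrt{\lambda_i}\, v_i
$$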
In [ ]:
Interpretation:

The scatter plot shows the relationship between house price and area.

The red dot marks the mean of price and area.

The variance lines show the spread of each variable independently.

The covariance lines show the joint variation of price and area.

The tilted covariance direction indicates a positive relationship between area and price; the correlation coefficient computed below summarizes this in a single number.
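Normalizing the covariance by the two standard deviations gives that correlation coefficient, a value between -1 and +1; a minimal sketch, assuming df is loaded as above:

In [ ]:
# Normalize the covariance into a correlation coefficient (assumes df from the first cell)
corr = np.corrcoef(df["price"], df["area"])[0, 1]
print(f"Correlation between price and area: {corr:.2f}")   # a positive value matches the tilt seen above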
In [5]:
data = df['price'].values  # replace 'price' with any numerical column

# Number of bins for histogram
nbins = 256
xmin = np.min(data)
xmax = np.max(data)
x = np.linspace(xmin, xmax, nbins)

print(f"{nbins} bins = {np.log2(nbins):.0f} bits")

# Function to calculate entropy
def entropy(dist):
    positives = dist[np.where(dist > 0)]  # 0 log(0) = 0
    return -np.sum(positives * np.log2(positives))

# 1. Uniform distribution
uniform = np.ones(nbins) / nbins

# 2. Gaussian distribution fitted to your data
mean = np.mean(data)
std = np.std(data)
normal = np.exp(-(x - mean)**2 / (2 * std**2))
normal = normal / np.sum(normal)  # normalize to sum=1

# 3. One-hot distribution at the middle
onehot = np.zeros(nbins)
onehot[nbins//2] = 1

# Plot
fig, axs = plt.subplots(3, 1, figsize=(8, 6))
fig.canvas.header_visible = False
width = 1.1*(xmax-xmin)/nbins

axs[0].bar(x, uniform, width=width)
axs[0].set_title(f"Uniform entropy: {entropy(uniform):.1f} bits")

axs[1].bar(x, normal, width=width)
axs[1].set_title(f"Gaussian entropy (fitted to your data): {entropy(normal):.1f} bits")

axs[2].bar(x, onehot, width=width)
axs[2].set_title(f"One-hot entropy: {entropy(onehot):.1f} bits")

plt.tight_layout()
plt.show()
256 bins = 8 bits
[Figure: uniform, Gaussian, and one-hot distributions with their entropies]
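The value printed in each panel title is the Shannon entropy of the binned distribution, computed by the entropy() function above with the convention 0 log 0 = 0:

$$
H = -\sum_i p_i \log_2 p_i
$$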
In [ ]:
Interpretation:

Uniform Distribution Entropy:

All outcomes have equal probability.

Entropy is the highest of the three cases.

This represents maximum uncertainty: no single value carries more information than another.

Gaussian Distribution Entropy:

Probability is concentrated near the mean value.

Entropy is lower than for the uniform distribution, indicating reduced uncertainty.

Prices are more predictable around the average.

One-Hot Distribution Entropy:

All probability is assigned to one value.

Entropy is zero: there is no uncertainty, and the outcome is completely predictable.
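For comparison, the same entropy measure can be applied to the empirical distribution of the actual prices; a minimal sketch, reusing data, nbins, and the entropy() function from the cell above:

In [ ]:
# Entropy of the empirical price distribution, for comparison with the three reference cases
# (reuses data, nbins, and entropy() from the cell above)
counts, _ = np.histogram(data, bins=nbins)
empirical = counts / np.sum(counts)   # normalize bin counts to probabilities
print(f"Empirical entropy: {entropy(empirical):.1f} bits")
print(f"Upper bound (uniform over {nbins} bins): {np.log2(nbins):.1f} bits")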