Fitting the Supply Chain Emission Factors with Margins¶
In [16]:
import pandas as pd
import plotly.express as px
import numpy as np
# -----------------------------
# 1. Load dataset
# -----------------------------
df = pd.read_csv("datasets/GHGEmissionFactors_1.csv")
# -----------------------------
# 2. Display names for the 2-digit NAICS sector prefixes
# -----------------------------
sector_names = {
    "11": "Agriculture, Forestry, Fishing & Hunting",
    "21": "Mining, Quarrying, Oil & Gas Extraction",
    "22": "Utilities",
    "23": "Construction",
    "31": "Manufacturing",
    "32": "Manufacturing",
    "33": "Manufacturing",
    "42": "Wholesale Trade",
    "44": "Retail Trade",
    "45": "Retail Trade",
    "48": "Transportation & Warehousing",
    "49": "Transportation & Warehousing",
    "51": "Information",
    "52": "Finance & Insurance",
    "53": "Real Estate & Rental & Leasing",
    "54": "Professional, Scientific & Technical Services",
    "55": "Management of Companies",
    "56": "Administrative & Support",
    "61": "Educational Services",
    "62": "Health Care & Social Assistance",
    "71": "Arts, Entertainment & Recreation",
    "72": "Accommodation & Food Services",
    "81": "Other Services",
    "92": "Public Administration",
}
# -----------------------------
# 3. Take the first two characters of the NAICS code as the sector prefix
# -----------------------------
naics_prefix = df["2017 NAICS Code"].astype(str).str[:2]
# -----------------------------
# 4. Fold prefixes that share a sector name into one code
# -----------------------------
combine_codes = {
    "32": "31",  # Manufacturing
    "33": "31",  # Manufacturing
    "45": "44",  # Retail Trade
    "49": "48",  # Transportation & Warehousing
}
df["SectorCode"] = naics_prefix.replace(combine_codes)
# -----------------------------
# 5. Attach the sector name (prefixes missing from sector_names map to NaN)
# -----------------------------
df["SectorName"] = df["SectorCode"].map(sector_names)
# -----------------------------
# 6. Sum the emission-factor column within each sector
# -----------------------------
emission_col = "Supply Chain Emission Factors with Margins"
agg_df = (
    df.groupby(["SectorCode", "SectorName"], as_index=False)
    .agg({emission_col: "sum"})
)
# -----------------------------
# 7. Scatter plot with polynomial fit
# -----------------------------
# Sector names are categorical, so the fit is run against each sector's row
# position in agg_df (0, 1, 2, ...) instead of the names themselves.
x_numeric = np.arange(len(agg_df))
y = agg_df["Supply Chain Emission Factors with Margins"].values
# Fit a 2nd-degree polynomial and evaluate it at the same positions
coeffs = np.polyfit(x_numeric, y, 2)
y_fit = np.polyval(coeffs, x_numeric)
# Plot the per-sector totals
fig = px.scatter(
    agg_df,
    x="SectorName",
    y="Supply Chain Emission Factors with Margins",
    title="Emission Share by Sector (MEF+SEF)"
)
# Add polynomial fit line
fig.add_traces(px.line(
    x=agg_df["SectorName"],
    y=y_fit
).data)
fig.data[1].name = "Polynomial Fit"  # Rename the fit line
# Plotly Express creates ungrouped traces with showlegend=False, so the name
# above is invisible unless the legend entry is switched on explicitly.
fig.data[1].showlegend = True
fig.update_layout(showlegend=True)
fig.show()
In [21]:
import pandas as pd
import plotly.express as px
import numpy as np
from scipy.optimize import curve_fit
# -----------------------------
# 1. Load dataset
# -----------------------------
df = pd.read_csv("datasets/GHGEmissionFactors_1.csv")
# -----------------------------
# 2. Names of the 2-digit NAICS sectors, keyed by prefix
# -----------------------------
sector_names = {
    "11": "Agriculture, Forestry, Fishing & Hunting",
    "21": "Mining, Quarrying, Oil & Gas Extraction",
    "22": "Utilities",
    "23": "Construction",
    "31": "Manufacturing",
    "32": "Manufacturing",
    "33": "Manufacturing",
    "42": "Wholesale Trade",
    "44": "Retail Trade",
    "45": "Retail Trade",
    "48": "Transportation & Warehousing",
    "49": "Transportation & Warehousing",
    "51": "Information",
    "52": "Finance & Insurance",
    "53": "Real Estate & Rental & Leasing",
    "54": "Professional, Scientific & Technical Services",
    "55": "Management of Companies",
    "56": "Administrative & Support",
    "61": "Educational Services",
    "62": "Health Care & Social Assistance",
    "71": "Arts, Entertainment & Recreation",
    "72": "Accommodation & Food Services",
    "81": "Other Services",
    "92": "Public Administration",
}
# -----------------------------
# 3-4. Sector code = first two characters of the NAICS code, with the
#      prefixes of multi-prefix sectors collapsed onto a single code
# -----------------------------
combine_codes = {
    "32": "31",  # Manufacturing
    "33": "31",  # Manufacturing
    "45": "44",  # Retail Trade
    "49": "48",  # Transportation & Warehousing
}
df = df.assign(
    SectorCode=df["2017 NAICS Code"].astype(str).str[:2].replace(combine_codes)
)
# -----------------------------
# 5. Look up the sector name for each merged code
# -----------------------------
df = df.assign(SectorName=df["SectorCode"].map(sector_names))
# -----------------------------
# 6. One row per sector: sum of "Supply Chain Emission Factors with Margins"
# -----------------------------
sector_keys = ["SectorCode", "SectorName"]
agg_df = df.groupby(sector_keys, as_index=False).agg(
    {"Supply Chain Emission Factors with Margins": "sum"}
)
# -----------------------------
# 7. Define a non-linear function
# -----------------------------
def func(x, a, b, c):
    """Exponential model with a constant offset: ``a * exp(b * x) + c``.

    Args:
        x: Scalar or NumPy array of positions at which to evaluate the model.
        a: Multiplier applied to the exponential term.
        b: Rate inside the exponent.
        c: Constant added to the result.

    Returns:
        The model value(s), with the same shape as ``x``.
    """
    exponential_term = np.exp(b * x)
    return a * exponential_term + c
# Sector names are categorical, so the model is fitted against each sector's
# row position in agg_df.
x_numeric = np.arange(len(agg_df))
y = agg_df["Supply Chain Emission Factors with Margins"].values
# Fit using non-linear least squares; p0 holds the initial guesses for (a, b, c)
popt, pcov = curve_fit(func, x_numeric, y, p0=(1, 0.01, 1))
y_fit = func(x_numeric, *popt)
# -----------------------------
# 8. Plot scatter + fitted curve
# -----------------------------
fig = px.scatter(
    agg_df,
    x="SectorName",
    y="Supply Chain Emission Factors with Margins",
    title="SEF + MEF by Sector with Non-linear Fit",
    # `labels` is keyed by column name; a key of "y" matches nothing and
    # leaves the axis showing the raw column name.
    labels={"Supply Chain Emission Factors with Margins": "SEF + MEF"}
)
# Add fitted non-linear curve
fig.add_traces(px.line(
    x=agg_df["SectorName"],
    y=y_fit
).data)
# func is an exponential plus a constant offset (it has no linear term).
fig.data[1].name = "Non-linear Fit (Exp + Offset)"
# Plotly Express creates ungrouped traces with showlegend=False, so the name
# is only visible once the legend entry is enabled.
fig.data[1].showlegend = True
fig.update_layout(showlegend=True)
fig.show()
# -----------------------------
# 9. Optional: print fitted parameters
# -----------------------------
print("Fitted parameters: a = {:.3f}, b = {:.3f}, c = {:.3f}".format(*popt))
Fitted parameters: a = 60.555, b = -0.050, c = -24.966
In [23]:
import pandas as pd
import plotly.express as px
import numpy as np
from scipy.optimize import curve_fit
# -----------------------------
# 1-2. Lookup table: 2-digit NAICS prefix -> sector name
# -----------------------------
sector_names = {
    "11": "Agriculture, Forestry, Fishing & Hunting",
    "21": "Mining, Quarrying, Oil & Gas Extraction",
    "22": "Utilities",
    "23": "Construction",
    "31": "Manufacturing",
    "32": "Manufacturing",
    "33": "Manufacturing",
    "42": "Wholesale Trade",
    "44": "Retail Trade",
    "45": "Retail Trade",
    "48": "Transportation & Warehousing",
    "49": "Transportation & Warehousing",
    "51": "Information",
    "52": "Finance & Insurance",
    "53": "Real Estate & Rental & Leasing",
    "54": "Professional, Scientific & Technical Services",
    "55": "Management of Companies",
    "56": "Administrative & Support",
    "61": "Educational Services",
    "62": "Health Care & Social Assistance",
    "71": "Arts, Entertainment & Recreation",
    "72": "Accommodation & Food Services",
    "81": "Other Services",
    "92": "Public Administration",
}
# Prefixes that belong to a sector spanning several prefixes are rewritten
# to that sector's first prefix.
combine_codes = {
    "32": "31",
    "33": "31",
    "45": "44",
    "49": "48",
}
# -----------------------------
# 3-5. Load the dataset, derive the merged sector code, attach its name
# -----------------------------
df = (
    pd.read_csv("datasets/GHGEmissionFactors_1.csv")
    .assign(
        SectorCode=lambda frame: (
            frame["2017 NAICS Code"].astype(str).str[:2].replace(combine_codes)
        )
    )
    .assign(SectorName=lambda frame: frame["SectorCode"].map(sector_names))
)
# -----------------------------
# 6. Sum "Supply Chain Emission Factors with Margins" per sector
# -----------------------------
summed_columns = {"Supply Chain Emission Factors with Margins": "sum"}
agg_df = df.groupby(["SectorCode", "SectorName"], as_index=False).agg(summed_columns)
# -----------------------------
# 7. Define Gaussian with fixed baseline
# -----------------------------
# Sector names are categorical, so the fit runs against row positions.
x_numeric = np.arange(len(agg_df))
y = agg_df["Supply Chain Emission Factors with Margins"].values


def _make_gaussian_fixed_baseline(baseline):
    """Return a Gaussian model whose additive baseline is frozen at `baseline`."""

    def gaussian_fixed_baseline(x, a, mu, sigma):
        """Gaussian bump on a constant baseline.

        Args:
            x: Scalar or NumPy array of positions.
            a: Peak height above the baseline.
            mu: Position of the peak centre.
            sigma: Width of the peak.

        Returns:
            ``a * exp(-(x - mu)**2 / (2 * sigma**2)) + baseline``.
        """
        return a * np.exp(-(x - mu)**2 / (2 * sigma**2)) + baseline

    return gaussian_fixed_baseline


# The baseline is captured once, here. Reading the module-level `y` inside the
# model would silently change the baseline as soon as another cell rebinds `y`.
gaussian_fixed_baseline = _make_gaussian_fixed_baseline(y.min())
# Initial guesses: a = peak height, mu = peak index, sigma = width
p0 = [y.max() - y.min(), np.argmax(y), 2]
# Fit Gaussian (only a, mu and sigma are free parameters)
popt, pcov = curve_fit(gaussian_fixed_baseline, x_numeric, y, p0=p0)
y_fit = gaussian_fixed_baseline(x_numeric, *popt)
# -----------------------------
# 8. Plot scatter + Gaussian fit
# -----------------------------
fig = px.scatter(
    agg_df,
    x="SectorName",
    y="Supply Chain Emission Factors with Margins",
    title="SEF + MEF by Sector with Gaussian Fit (Fixed Baseline)",
    # `labels` is keyed by column name; a key of "y" matches nothing and
    # leaves the axis showing the raw column name.
    labels={"Supply Chain Emission Factors with Margins": "SEF + MEF"}
)
# Add Gaussian fit line
fig.add_traces(px.line(
    x=agg_df["SectorName"],
    y=y_fit
).data)
fig.data[1].name = "Gaussian Fit (Fixed Baseline)"
# Plotly Express creates ungrouped traces with showlegend=False, so the name
# is only visible once the legend entry is enabled.
fig.data[1].showlegend = True
fig.update_layout(showlegend=True)
fig.show()
# -----------------------------
# 9. Print fitted parameters
# -----------------------------
# popt holds (a, mu, sigma) in the order of the model's parameters; the
# baseline is not fitted, it is the minimum of the data.
print("Fitted parameters (fixed baseline):")
print("Amplitude (a) =", popt[0])
print("Peak center (mu) =", popt[1])
print("Width (sigma) =", popt[2])
print("Baseline (b) =", min(y))
Fitted parameters (fixed baseline): Amplitude (a) = 125.83035383901439 Peak center (mu) = 4.018145786554115 Width (sigma) = 0.42107700515431074 Baseline (b) = 0.252
In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv("datasets/BTC_USD_full_data.csv")
# Clean the data: values that cannot be parsed become NaT/NaN and those rows
# are dropped.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
# Put the rows in chronological order: the fitted curve in this cell is drawn
# as a line through consecutive rows, which zigzags if the file is unsorted.
df = (
    df.dropna(subset=['Date', 'Close'])
    .sort_values('Date')
    .reset_index(drop=True)
)
# Whole days elapsed since the earliest date; used as the numeric x for fitting
df['DayIndex'] = (df['Date'] - df['Date'].min()).dt.days
X = df['DayIndex'].values
y = df['Close'].values
plt.figure(figsize=(14,6))
plt.scatter(df['Date'], y, s=10, label="Data points", alpha=0.7)
plt.xlabel("Date")
plt.ylabel("BTC Closing Price (USD)")
plt.title("Bitcoin Price Data Points")
plt.legend()
plt.show()
# Fit a polynomial trend. The degree is defined once so that the legend label
# always states the degree that was actually fitted (previously the code used
# degree 4 while the comment and label said 3).
POLY_DEGREE = 4
coeffs = np.polyfit(X, y, POLY_DEGREE)
poly = np.poly1d(coeffs)
# Draw the curve in day order so the line does not double back on itself if
# the rows are not chronological.
order = np.argsort(X, kind="stable")
plt.figure(figsize=(14,6))
# data points
plt.scatter(df['Date'], y, s=10, alpha=0.6, label="Data points")
# fitted curve
plt.plot(df['Date'].iloc[order], poly(X[order]), color='red', linewidth=2,
         label=f"Fit curve (degree {POLY_DEGREE})")
plt.xlabel("Date")
plt.ylabel("BTC Closing Price (USD)")
plt.title("BTC Data Points with Fitted Curve")
plt.legend()
plt.show()
In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import RBFInterpolator
# Read the BTC price file, coerce Date/Close to proper dtypes (unparseable
# values become NaT/NaN), drop the rows that failed, and add the number of
# whole days since the earliest date.
df = (
    pd.read_csv("datasets/BTC_USD_full_data.csv")
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'),
        Close=lambda frame: pd.to_numeric(frame['Close'], errors='coerce'),
    )
    .dropna(subset=['Date', 'Close'])
    .assign(DayIndex=lambda frame: (frame['Date'] - frame['Date'].min()).dt.days)
)
# RBFInterpolator is given a 2-D (n_samples, 1) coordinate array and 1-D values
X = df['DayIndex'].to_numpy().reshape(-1, 1)
y = df['Close'].to_numpy()
# ----- RBF FIT -----
# RBFInterpolator's gaussian kernel is exp(-(epsilon * r)**2): epsilon
# multiplies the distance, so it is the reciprocal of the kernel width.
# Passing epsilon=50 with x in days makes the kernel vanish between
# neighbouring days, leaving a curve that is flat between samples.
RBF_LENGTH_SCALE_DAYS = 50  # kernel width, in days
# A wide gaussian kernel on closely spaced samples gives a nearly singular
# system; a positive smoothing term regularises it (and tolerates repeated
# day indices). Larger values give a smoother curve.
RBF_SMOOTHING = 1.0
rbf = RBFInterpolator(
    X,
    y,
    kernel='gaussian',
    epsilon=1.0 / RBF_LENGTH_SCALE_DAYS,
    smoothing=RBF_SMOOTHING,
)
# Smooth X for smooth curve: 2000 evenly spaced day offsets across the range
X_smooth = np.linspace(X.min(), X.max(), 2000).reshape(-1, 1)
y_smooth = rbf(X_smooth)
# ----- Plot -----
plt.figure(figsize=(14,6))
# Data points
plt.scatter(df['Date'], y, s=10, alpha=0.4, label="Data Points")
# RBF curve: convert the day offsets back to calendar dates for the x-axis
dates_smooth = df['Date'].min() + pd.to_timedelta(X_smooth.flatten(), unit='D')
plt.plot(dates_smooth, y_smooth, color='red', linewidth=2, label="RBF Fit")
plt.xlabel("Date")
plt.ylabel("BTC Closing Price (USD)")
plt.title("RBF Fit for BTC Closing Price")
plt.legend()
plt.show()
Gaussian Smoothing¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
# --- Load and clean data ---
df = pd.read_csv("datasets/BTC_USD_full_data.csv")
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
# gaussian_filter1d averages each value with its neighbouring ROWS, so the
# rows must be in chronological order for the result to be a time smoothing.
df = df.dropna(subset=['Date', 'Close']).sort_values('Date')
# --- Apply Gaussian smoothing ---
# sigma is measured in rows, not calendar time.
# NOTE(review): it equals days only if the file has one row per day — confirm.
sigma = 20 # Adjust smoothness (10 = sharp, 20 = smooth, 40 = very smooth)
y_smooth = gaussian_filter1d(df['Close'].values, sigma=sigma)
# --- Plot ---
plt.figure(figsize=(14,6))
# Original data
plt.scatter(df['Date'], df['Close'], s=8, alpha=0.3, label="Raw Data")
# Smoothed curve
plt.plot(df['Date'], y_smooth, color='red', linewidth=2, label=f"Gaussian Smoothing (sigma={sigma})")
plt.xlabel("Date")
plt.ylabel("BTC Closing Price (USD)")
plt.title("Gaussian Smoothing on BTC Closing Price")
plt.legend()
plt.grid(alpha=0.3)
plt.show()
References¶
- MathWorks. (n.d.). Radial basis function networks. https://www.mathworks.com/help/deeplearning/ug/radial-basis-function-networks.html
- Regmi, S. (2021, May 30). Gaussian smoothing in time series data. Medium (TDS Archive). Retrieved from https://medium.com/data-science/gaussian-smoothing-in-time-series-data-c6801f8a4dc3