import pandas as pd
import plotly.express as px
import numpy as np

# -----------------------------
# 1. Load dataset
# -----------------------------
df = pd.read_csv("datasets/GHGEmissionFactors_1.csv")

# -----------------------------
# 2. NAICS sector names (2-digit)
# -----------------------------
# Sectors that span several 2-digit codes: each key is folded into its value
# so the whole sector ends up in a single group.
combine_codes = {
    "32": "31",  # Manufacturing
    "33": "31",  # Manufacturing
    "45": "44",  # Retail Trade
    "49": "48",  # Transportation & Warehousing
}

# One name per distinct sector, keyed by the code the sector is merged into.
_primary_sector_names = {
    "11": "Agriculture, Forestry, Fishing & Hunting",
    "21": "Mining, Quarrying, Oil & Gas Extraction",
    "22": "Utilities",
    "23": "Construction",
    "31": "Manufacturing",
    "42": "Wholesale Trade",
    "44": "Retail Trade",
    "48": "Transportation & Warehousing",
    "51": "Information",
    "52": "Finance & Insurance",
    "53": "Real Estate & Rental & Leasing",
    "54": "Professional, Scientific & Technical Services",
    "55": "Management of Companies",
    "56": "Administrative & Support",
    "61": "Educational Services",
    "62": "Health Care & Social Assistance",
    "71": "Arts, Entertainment & Recreation",
    "72": "Accommodation & Food Services",
    "81": "Other Services",
    "92": "Public Administration",
}

# Full lookup: the merged-away codes inherit the name of the code they map
# to; sorting restores ascending code order.
sector_names = dict(sorted(
    {
        **_primary_sector_names,
        **{
            extra: _primary_sector_names[main]
            for extra, main in combine_codes.items()
        },
    }.items()
))

# -----------------------------
# 3-4. Extract the 2-digit sector code and merge repeated sectors
# -----------------------------
# The first two characters of the stringified NAICS code identify the sector.
two_digit_codes = df["2017 NAICS Code"].astype(str).str[:2]
df["SectorCode"] = two_digit_codes.replace(combine_codes)

# -----------------------------
# 5. Add sector names
# -----------------------------
df["SectorName"] = df["SectorCode"].map(sector_names)

# -----------------------------
# 6. Aggregate SEF
# -----------------------------
# Sum the emission-factor column within each (code, name) pair.
factor_column = "Supply Chain Emission Factors with Margins"
agg_df = df.groupby(
    ["SectorCode", "SectorName"], as_index=False
)[factor_column].sum()
# -----------------------------
# 7. Scatter plot with polynomial fit
# -----------------------------
# Sector names are categorical, so the fit uses each row's position in
# agg_df (0, 1, 2, ...) as its numeric x value.
x_numeric = np.arange(len(agg_df))
y = agg_df["Supply Chain Emission Factors with Margins"].values

# Least-squares 2nd-degree polynomial, evaluated at the same positions
coeffs = np.polyfit(x_numeric, y, 2)
y_fit = np.polyval(coeffs, x_numeric)

# Scatter of the aggregated values, one marker per sector
fig = px.scatter(
    agg_df,
    x="SectorName",
    y="Supply Chain Emission Factors with Margins",
    title="Emission Share by Sector (MEF+SEF)"
)

# Overlay the fitted values as a line on the same categorical axis
fig.add_traces(px.line(
    x=agg_df["SectorName"],
    y=y_fit
).data)

# Plotly Express creates a single, unnamed trace with showlegend=False, so
# renaming it alone never reaches the legend; switch the entry on as well.
fig.data[-1].update(name="Polynomial Fit", showlegend=True)
fig.show()

import pandas as pd
import plotly.express as px
import numpy as np
from scipy.optimize import curve_fit

# -----------------------------
# 1. Load dataset
# -----------------------------
df = pd.read_csv("datasets/GHGEmissionFactors_1.csv")

# -----------------------------
# 2. NAICS sector names (2-digit)
# -----------------------------
# Each row lists every 2-digit code belonging to a sector, followed by the
# sector name. The first code in a row is the one the others are merged into.
_sector_table = [
    (("11",), "Agriculture, Forestry, Fishing & Hunting"),
    (("21",), "Mining, Quarrying, Oil & Gas Extraction"),
    (("22",), "Utilities"),
    (("23",), "Construction"),
    (("31", "32", "33"), "Manufacturing"),
    (("42",), "Wholesale Trade"),
    (("44", "45"), "Retail Trade"),
    (("48", "49"), "Transportation & Warehousing"),
    (("51",), "Information"),
    (("52",), "Finance & Insurance"),
    (("53",), "Real Estate & Rental & Leasing"),
    (("54",), "Professional, Scientific & Technical Services"),
    (("55",), "Management of Companies"),
    (("56",), "Administrative & Support"),
    (("61",), "Educational Services"),
    (("62",), "Health Care & Social Assistance"),
    (("71",), "Arts, Entertainment & Recreation"),
    (("72",), "Accommodation & Food Services"),
    (("81",), "Other Services"),
    (("92",), "Public Administration"),
]

# code -> sector name, for every listed code
sector_names = {
    code: name
    for codes, name in _sector_table
    for code in codes
}

# -----------------------------
# 3-4. Extract the 2-digit sector code and combine repeated sectors
# -----------------------------
# secondary code -> first code of the same sector
combine_codes = {
    code: codes[0]
    for codes, _ in _sector_table
    for code in codes[1:]
}

# The first two characters of the stringified NAICS code identify the sector.
sector_prefix = df["2017 NAICS Code"].astype(str).str[:2]
df["SectorCode"] = sector_prefix.replace(combine_codes)

# -----------------------------
# 5. Add sector names
# -----------------------------
df["SectorName"] = df["SectorCode"].map(sector_names)

# -----------------------------
# 6. Aggregate SEF+MEF
# -----------------------------
# Sum the emission-factor column within each (code, name) pair.
sector_groups = df.groupby(["SectorCode", "SectorName"], as_index=False)
agg_df = sector_groups["Supply Chain Emission Factors with Margins"].sum()

# -----------------------------
# 7. Define a non-linear function
# -----------------------------
def func(x, a, b, c):
    """Evaluate the model ``a * exp(b * x) + c``.

    Args:
        x: Point(s) at which to evaluate the model.
        a: Factor multiplying the exponential term.
        b: Coefficient of ``x`` inside the exponential.
        c: Constant added to the scaled exponential.

    Returns:
        The model value(s) at ``x``.
    """
    exponential = np.exp(b * x)
    scaled = a * exponential
    return scaled + c

# Sector names are categorical, so the fit uses each row's position in
# agg_df (0, 1, 2, ...) as its numeric x value.
x_numeric = np.arange(len(agg_df))
y = agg_df["Supply Chain Emission Factors with Margins"].values

# Fit using non-linear least squares; p0 holds the initial guesses for
# (a, b, c) in the order func takes them
popt, pcov = curve_fit(func, x_numeric, y, p0=(1, 0.01, 1))
y_fit = func(x_numeric, *popt)

# -----------------------------
# 8. Plot scatter + fitted curve
# -----------------------------
fig = px.scatter(
    agg_df,
    x="SectorName",
    y="Supply Chain Emission Factors with Margins",
    title="SEF + MEF by Sector with Non-linear Fit",
    # labels is keyed by column name; a "y" key matches no column and
    # leaves the axis title unchanged.
    labels={"Supply Chain Emission Factors with Margins": "SEF + MEF"}
)

# Add fitted non-linear curve
fig.add_traces(px.line(
    x=agg_df["SectorName"],
    y=y_fit
).data)

# Plotly Express creates a single, unnamed trace with showlegend=False, so
# the legend entry has to be switched on together with the name. The name
# describes the fitted model: an exponential plus a constant offset.
fig.data[-1].update(name="Non-linear Fit (Exp + Constant)", showlegend=True)
fig.show()

# -----------------------------
# 9. Optional: print fitted parameters
# -----------------------------
print("Fitted parameters: a = {:.3f}, b = {:.3f}, c = {:.3f}".format(*popt))

Fitted parameters: a = 60.555, b = -0.050, c = -24.966

import pandas as pd
import plotly.express as px
import numpy as np
from scipy.optimize import curve_fit

# -----------------------------
# 1. Load dataset
# -----------------------------
df = pd.read_csv("datasets/GHGEmissionFactors_1.csv")

# -----------------------------
# 2. NAICS sector names
# -----------------------------
# Sector name -> every 2-digit code that carries that name.
_codes_by_sector = {
    "Agriculture, Forestry, Fishing & Hunting": ["11"],
    "Mining, Quarrying, Oil & Gas Extraction": ["21"],
    "Utilities": ["22"],
    "Construction": ["23"],
    "Manufacturing": ["31", "32", "33"],
    "Wholesale Trade": ["42"],
    "Retail Trade": ["44", "45"],
    "Transportation & Warehousing": ["48", "49"],
    "Information": ["51"],
    "Finance & Insurance": ["52"],
    "Real Estate & Rental & Leasing": ["53"],
    "Professional, Scientific & Technical Services": ["54"],
    "Management of Companies": ["55"],
    "Administrative & Support": ["56"],
    "Educational Services": ["61"],
    "Health Care & Social Assistance": ["62"],
    "Arts, Entertainment & Recreation": ["71"],
    "Accommodation & Food Services": ["72"],
    "Other Services": ["81"],
    "Public Administration": ["92"],
}

# Inverted view used for the lookup: code -> sector name
sector_names = {
    code: sector
    for sector, codes in _codes_by_sector.items()
    for code in codes
}

# -----------------------------
# 3-4. Extract the 2-digit sector code and combine repeated sectors
# -----------------------------
# Codes folded into another code of the same sector before aggregating
combine_codes = dict([
    ("32", "31"),
    ("33", "31"),
    ("45", "44"),
    ("49", "48"),
])

# The first two characters of the stringified NAICS code identify the sector.
naics_prefix = df["2017 NAICS Code"].astype(str).str[:2]
df["SectorCode"] = naics_prefix.replace(combine_codes)

# -----------------------------
# 5. Add sector names
# -----------------------------
df["SectorName"] = df["SectorCode"].map(sector_names)

# -----------------------------
# 6. Aggregate SEF + MEF
# -----------------------------
# Sum the emission-factor column within each (code, name) pair.
group_keys = ["SectorCode", "SectorName"]
agg_df = df.groupby(group_keys, as_index=False)[
    "Supply Chain Emission Factors with Margins"
].sum()

# -----------------------------
# 7. Define Gaussian with fixed baseline
# -----------------------------
# Sector names are categorical, so the fit uses each row's position in
# agg_df (0, 1, 2, ...) as its numeric x value.
x_numeric = np.arange(len(agg_df))
y = agg_df["Supply Chain Emission Factors with Margins"].values

# The baseline is pinned to the smallest observed value. It is computed once,
# before the model is defined, so the model neither rescans the data on every
# evaluation made by curve_fit nor changes if `y` is rebound later on.
gaussian_baseline = np.min(y)


def gaussian_fixed_baseline(x, a, mu, sigma):
    """Evaluate a Gaussian bump sitting on the fixed baseline.

    Computes ``a * exp(-(x - mu)**2 / (2 * sigma**2)) + gaussian_baseline``.
    Only ``a``, ``mu`` and ``sigma`` are free parameters; the baseline is
    the module-level ``gaussian_baseline`` and is not fitted.

    Args:
        x: Point(s) at which to evaluate the model.
        a: Height of the bump above the baseline.
        mu: Location of the bump's centre.
        sigma: Width of the bump.

    Returns:
        The model value(s) at ``x``.
    """
    return a * np.exp(-(x - mu)**2 / (2 * sigma**2)) + gaussian_baseline


# Initial guesses: a = peak height above the baseline, mu = index of the
# largest value, sigma = width
p0 = [np.max(y) - gaussian_baseline, np.argmax(y), 2]

# Fit Gaussian
popt, pcov = curve_fit(gaussian_fixed_baseline, x_numeric, y, p0=p0)
y_fit = gaussian_fixed_baseline(x_numeric, *popt)

# -----------------------------
# 8. Plot scatter + Gaussian fit
# -----------------------------
fig = px.scatter(
    agg_df,
    x="SectorName",
    y="Supply Chain Emission Factors with Margins",
    title="SEF + MEF by Sector with Gaussian Fit (Fixed Baseline)",
    # labels is keyed by column name; a "y" key matches no column and
    # leaves the axis title unchanged.
    labels={"Supply Chain Emission Factors with Margins": "SEF + MEF"}
)

# Add Gaussian fit line
fig.add_traces(px.line(
    x=agg_df["SectorName"],
    y=y_fit
).data)

# Plotly Express creates a single, unnamed trace with showlegend=False, so
# the legend entry has to be switched on together with the name.
fig.data[-1].update(name="Gaussian Fit (Fixed Baseline)", showlegend=True)
fig.show()

# -----------------------------
# 9. Print fitted parameters
# -----------------------------
# popt holds the fitted values in the order (a, mu, sigma); the baseline is
# not fitted, it is the minimum of the data.
print("Fitted parameters (fixed baseline):")
print("Amplitude (a) =", popt[0])
print("Peak center (mu) =", popt[1])
print("Width (sigma) =", popt[2])
print("Baseline (b) =", min(y))

Fitted parameters (fixed baseline):
Amplitude (a) = 125.83035383901439
Peak center (mu) = 4.018145786554115
Width (sigma) = 0.42107700515431074
Baseline (b) = 0.252

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the price history; unparseable dates or prices become NaT/NaN and the
# affected rows are dropped.
df = (
    pd.read_csv("datasets/BTC_USD_full_data.csv")
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'),
        Close=lambda frame: pd.to_numeric(frame['Close'], errors='coerce'),
    )
    .dropna(subset=['Date', 'Close'])
)

# Whole days elapsed since the earliest remaining date
first_date = df['Date'].min()
df['DayIndex'] = (df['Date'] - first_date).dt.days

# Numeric inputs for fitting: day offsets and closing prices
X = df['DayIndex'].to_numpy()
y = df['Close'].to_numpy()

# Raw data only, as a scatter over calendar dates
raw_axes = plt.figure(figsize=(14, 6)).add_subplot(111)
raw_axes.scatter(df['Date'], y, s=10, alpha=0.7, label="Data points")
raw_axes.set_xlabel("Date")
raw_axes.set_ylabel("BTC Closing Price (USD)")
raw_axes.set_title("Bitcoin Price Data Points")
raw_axes.legend()
plt.show()

# Fit a polynomial in the day index. The degree is defined once so that the
# fit and the legend text cannot drift apart (the legend used to announce
# degree 3 while a degree-4 fit was being drawn).
POLY_DEGREE = 4
coeffs = np.polyfit(X, y, POLY_DEGREE)
poly = np.poly1d(coeffs)

plt.figure(figsize=(14,6))

# data points
plt.scatter(df['Date'], y, s=10, alpha=0.6, label="Data points")

# fitted curve, evaluated at the same day offsets as the data
plt.plot(df['Date'], poly(X), color='red', linewidth=2,
         label=f"Fit curve (degree {POLY_DEGREE})")

plt.xlabel("Date")
plt.ylabel("BTC Closing Price (USD)")
plt.title("BTC Data Points with Fitted Curve")
plt.legend()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import RBFInterpolator

# Load the price history; unparseable dates or prices become NaT/NaN and the
# affected rows are dropped.
df = (
    pd.read_csv("datasets/BTC_USD_full_data.csv")
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'),
        Close=lambda frame: pd.to_numeric(frame['Close'], errors='coerce'),
    )
    .dropna(subset=['Date', 'Close'])
)

# Whole days elapsed since the earliest remaining date
earliest_date = df['Date'].min()
df['DayIndex'] = (df['Date'] - earliest_date).dt.days

# Day offsets as an (n, 1) column for the interpolator, prices as a flat array
X = df[['DayIndex']].to_numpy()
y = df['Close'].to_numpy()

# ----- RBF FIT -----
# RBFInterpolator multiplies distances by `epsilon` before applying the
# kernel, so the gaussian basis is exp(-(epsilon * r)**2). With X measured in
# days, epsilon=50 makes every basis function vanish long before it reaches
# the neighbouring day, and the curve cannot follow the data between samples.
# A bump roughly RBF_WIDTH_DAYS wide needs the reciprocal instead.
RBF_WIDTH_DAYS = 50

# With a kernel this wide, exact interpolation (smoothing=0) leads to a
# nearly singular linear system; a positive smoothing term regularises it and
# turns the result into a smooth fit rather than a curve through every point.
# Tunable: larger values give a smoother curve.
RBF_SMOOTHING = 1.0

rbf = RBFInterpolator(
    X,
    y,
    kernel='gaussian',
    epsilon=1.0 / RBF_WIDTH_DAYS,
    smoothing=RBF_SMOOTHING,
)

# Evaluate on 2000 evenly spaced day offsets covering the data range
X_smooth = np.linspace(X.min(), X.max(), 2000).reshape(-1, 1)
y_smooth = rbf(X_smooth)

# ----- Plot -----
plt.figure(figsize=(14,6))

# Data points
plt.scatter(df['Date'], y, s=10, alpha=0.4, label="Data Points")

# RBF curve: convert the day offsets back to calendar dates for the x axis
dates_smooth = df['Date'].min() + pd.to_timedelta(X_smooth.flatten(), unit='D')
plt.plot(dates_smooth, y_smooth, color='red', linewidth=2, label="RBF Fit")

plt.xlabel("Date")
plt.ylabel("BTC Closing Price (USD)")
plt.title("RBF Fit for BTC Closing Price")
plt.legend()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

# --- Load and clean data ---
# Unparseable dates or prices become NaT/NaN and the affected rows are dropped.
df = pd.read_csv("datasets/BTC_USD_full_data.csv")
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
df = df.dropna(subset=['Date', 'Close'])

# The filter below blends each value with its neighbouring rows, so the rows
# must be in chronological order for the result to be a smoothing over time.
# A stable sort keeps same-date rows in their file order.
df = df.sort_values('Date', kind='mergesort')

# --- Apply Gaussian smoothing ---
sigma = 20    # Adjust smoothness (10 = sharp, 20 = smooth, 40 = very smooth)

# gaussian_filter1d returns an array of the input's dtype by default; passing
# floats keeps an integer-valued price column from being truncated.
y_smooth = gaussian_filter1d(df['Close'].to_numpy(dtype=float), sigma=sigma)

# --- Plot ---
smooth_axes = plt.figure(figsize=(14, 6)).add_subplot(111)

# Raw closing prices as faint markers
smooth_axes.scatter(df['Date'], df['Close'], s=8, alpha=0.3, label="Raw Data")

# Smoothed series drawn on top; the legend records the sigma that was used
curve_label = f"Gaussian Smoothing (sigma={sigma})"
smooth_axes.plot(df['Date'], y_smooth, color='red', linewidth=2, label=curve_label)

smooth_axes.set_xlabel("Date")
smooth_axes.set_ylabel("BTC Closing Price (USD)")
smooth_axes.set_title("Gaussian Smoothing on BTC Closing Price")
smooth_axes.legend()
smooth_axes.grid(alpha=0.3)
plt.show()

Fitting the Supply Chain Emission Factors with Margins¶

Gaussian Smoothing¶

References¶