# import python modules
import fabmodules as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime
from matplotlib.ticker import FuncFormatter

# set parameters
output_path = fm.output_path = "outputs/"
today = fm.today = datetime.today().strftime("%Y-%m-%d")

# colors and more
traffic_strong="grey,red,orange,yellow,green".split(",")
traffic_soft=np.array([(231,231,242),(247,149,148),(247,180,149),(247,220,149),(144,240,156)]) / 255 # grey, red, orange, yellow, green
blue_scale=np.array([(231,231,242),(230,245,255),(189,227,251),(148,209,247),(81,172,230),(16,106,166)]) / 255 # grey, blue 20, 40, 60, 80, 100
green_scale=np.array([(231,231,242),(228,252,231),(196,245,202),(144,240,156),(84,209,95),(18,179,47)]) /255 # grey, green 20, 40, 60, 80, 100

# read results from session 2
url = output_path + "fablabcounts.xlsx" 
result = pd.read_excel(url)
# country_code Not-Available should be replaced by "NA" (Namibia)
result["country_code"]=result["country_code"].fillna("NA")
fm.log("fablabcounts",url,result)

2025-12-17T15:21Z fablabcounts outputs/fablabcounts.xlsx (1689, 3)

# check dataframe structure
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1689 entries, 0 to 1688
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   country_code  1689 non-null   object
 1   date_code     1689 non-null   object
 2   count         1689 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 39.7+ KB

# check result for single country
country_code = "FR"
result[result["country_code"]==country_code]

# Read the country list
url = "http://api.geonames.org/countryInfo?username=fab23workshop"
countries = pd.read_xml(url, parser="etree")

# Set country_code = "NA" where countryName == "Namibia"
countries.loc[countries["countryName"] == "Namibia", "countryCode"] = "NA"

# Set continent = "NA" where continentName == "North America"
countries.loc[countries["continentName"] == "North America", "continent"] = "NA"

fm.countries = countries
fm.log("countries ",url,countries)

2025-12-17T15:21Z countries  http://api.geonames.org/countryInfo?username=fab23workshop (250, 18)

# function to convert date_code into numeric
# function created by chatGPT
def convert_dc(date_code):
    base=int(date_code[0:2])
    frac = .0 if date_code[2]=="m" else .5
    return base + frac

convert_dc("19u")

19.5

# add numeric date codes to result
result['x']=result['date_code'].apply(convert_dc)

# check result for single country
result[result["country_code"]==country_code]

# function generated by ChatGPT via interactive process
def compute_trend_coeffs(df):
    """
    df must contain:
      - country_code
      - x (numeric axis value)
      - count (y-value)
    Returns a dataframe with:
      country_code | slope | intercept
    """
    rows = []

    for c in sorted(df["country_code"].dropna().unique()):
        data = df[df["country_code"] == c]

        # Need at least 2 points for a linear fit
        if len(data) >= 2:
            m, b = np.polyfit(data["x"], data["count"], 1)
        else:
            m, b = np.nan, np.nan

        rows.append({"country_code": c, "slope": m, "intercept": b})

    return pd.DataFrame(rows)

# calculate coefficients and show
coeff_df = compute_trend_coeffs(result)
coeff_df

# plot counts plus trend line
def plot_counts(df, trend_df, country=None):
    """
    df: dataframe with ['country_code', 'date_code', 'x', 'count']
    trend_df: dataframe with ['country_code', 'slope', 'intercept']
    country: optional filter. If None, counts are summed across all countries.
    """

    # ---- Select data ----
    if country:
        data = df[df["country_code"] == country].copy()
    else:
        # Sum counts across countries for each date
        data = (
            df.groupby(["date_code", "x"], as_index=False)["count"]
              .sum()
        )

    # ---- Sort by date ----
    data = data.sort_values("x")

    # ---- Retrieve trend coefficients ----
    if country:
        row = trend_df[trend_df["country_code"] == country]
    else:
        # global trend: compute on the fly
        if len(data) >= 2:
            m, b = np.polyfit(data["x"], data["count"], 1)
        else:
            m, b = np.nan, np.nan
        row = None

    if row is not None and not row.empty:
        m = row["slope"].iloc[0]
        b = row["intercept"].iloc[0]

    # ---- Plot figure ----
    plt.figure(figsize=(10, 5))

    # Main line
    plt.plot(data["date_code"], data["count"], marker="o")

    # Annotate integer values
    for x_pos, y in zip(data["date_code"], data["count"]):
        plt.text(x_pos, y, str(int(y)), ha='center', va='bottom')

    # ---- Trend line ----
    if not np.isnan(m):
        x_vals = np.array(data["x"])
        y_trend = m * x_vals + b
        plt.plot(data["date_code"], y_trend, linestyle="--", label="Trend line")

    # ---- Axis formatting ----
    plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x)}"))

    plt.xlabel(f"year (m = medio, u = ultimo)\n(chart prepared {today}) ")
    plt.ylabel('Fablab counts \n(any activity_status except "closed")')

    countryName = fm.get_country(country)
    title = f"{countryName if country else 'World'}: Fablab counts over time"
    plt.title(title)
    plt.xticks(rotation=45)

    plt.legend()
    plt.grid(True)
    plt.tight_layout()

    # ---- Save & show ----
    aux = "ct."+ country.lower() if country else "world"
    plt.savefig(output_path + aux + "_flcounts_over_time.png", dpi=300, bbox_inches="tight")
    plt.show()

# Create world fablabs count over time plus trend 
result_df = result
plot_counts(result_df, coeff_df, country=None)

# Create single country fablabs count over time plus trend 
country_code = "FR"
result_df = result
plot_counts(result_df, coeff_df, country=country_code)

# Create fablabs count over time plus trend for all countries
df = result
for c in sorted(df["country_code"].dropna().unique()):
    print("Plotting:", c)   # optional
    plot_counts(df, coeff_df, country=c)
    #plot_counts_with_trend(df, coeff_df, country=c)

	country_code	date_code	count
501	FR	19m	213
502	FR	19u	217
503	FR	20m	225
504	FR	20u	234
505	FR	21m	234
506	FR	21u	235
507	FR	22m	236
508	FR	22u	242
509	FR	23m	242
510	FR	23u	245
511	FR	24m	275
512	FR	24u	279
513	FR	25m	281

	country_code	date_code	count	x
501	FR	19m	213	19.0
502	FR	19u	217	19.5
503	FR	20m	225	20.0
504	FR	20u	234	20.5
505	FR	21m	234	21.0
506	FR	21u	235	21.5
507	FR	22m	236	22.0
508	FR	22u	242	22.5
509	FR	23m	242	23.0
510	FR	23u	245	23.5
511	FR	24m	275	24.0
512	FR	24u	279	24.5
513	FR	25m	281	25.0

	country_code	slope	intercept
0	AE	3.846154e-01	-0.846154
1	AF	3.822909e-17	1.000000
2	AL	2.743091e-16	2.000000
3	AM	7.692308e-02	-0.628205
4	AR	1.197802e+00	-9.274725
...	...	...	...
137	VE	6.593407e-02	1.472527
138	VN	1.120879e+00	-6.582418
139	XK	NaN	NaN
140	YT	1.371545e-16	1.000000
141	ZA	-2.527473e-01	13.021978

DataScience Session 3: Fitting¶

Synopsis¶

Resources¶

Assignment¶

A. Research ideas¶

B. Research planning and design¶

C. Data collection¶

Read the fablab timelines¶

Read countries dataset¶

D. Data processing¶

Convert the data_code to numeric, add as extra field "x"¶

E. Data Study and Analysis¶

Calculate linear fit coefficients for all countries¶

F. Data Publishing and Access¶

Create function for plotting with trend lines¶

Line chart world fablabs with trend¶

Line chart single country fablabs with trend¶

Line charts with trend of all countries (remove cell output before committing)¶

G. Data Preservation¶

H. Data Re-use¶

Evaluation and Follow-up¶

Follow-up¶

Review¶

	country_code	date_code	count
501	FR	19m	213
502	FR	19u	217
503	FR	20m	225
504	FR	20u	234
505	FR	21m	234
506	FR	21u	235
507	FR	22m	236
508	FR	22u	242
509	FR	23m	242
510	FR	23u	245
511	FR	24m	275
512	FR	24u	279
513	FR	25m	281

	country_code	date_code	count
501	FR	19m	213
502	FR	19u	217
503	FR	20m	225
504	FR	20u	234
505	FR	21m	234
506	FR	21u	235
507	FR	22m	236
508	FR	22u	242
509	FR	23m	242
510	FR	23u	245
511	FR	24m	275
512	FR	24u	279
513	FR	25m	281

	country_code	date_code	count
501	FR	19m	213
502	FR	19u	217
503	FR	20m	225
504	FR	20u	234
505	FR	21m	234
506	FR	21u	235
507	FR	22m	236
508	FR	22u	242
509	FR	23m	242
510	FR	23u	245
511	FR	24m	275
512	FR	24u	279
513	FR	25m	281