# import python modules
import fabmodules as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# set parameters
    
# Colour palettes.
# traffic_strong holds named colours; the other palettes hold one RGB row
# per category, rescaled from 0-255 integers to 0-1 floats.
traffic_strong = ["grey", "red", "orange", "yellow", "green"]

# grey, red, orange, yellow, green
traffic_soft = np.array([
    (231, 231, 242),
    (247, 149, 148),
    (247, 180, 149),
    (247, 220, 149),
    (144, 240, 156),
]) / 255

# grey, blue 20, 40, 60, 80, 100
blue_scale = np.array([
    (231, 231, 242),
    (230, 245, 255),
    (189, 227, 251),
    (148, 209, 247),
    (81, 172, 230),
    (16, 106, 166),
]) / 255

# grey, green 20, 40, 60, 80, 100
green_scale = np.array([
    (231, 231, 242),
    (228, 252, 231),
    (196, 245, 202),
    (144, 240, 156),
    (84, 209, 95),
    (18, 179, 47),
]) / 255

# Load the fablabs.io lab register (JSON endpoint) into a DataFrame.
url = "https://fablabs.io/labs.json?class=btn+btn-primary"
fablabs = pd.read_json(path_or_buf=url)

# Record the source URL and the loaded frame through the project log helper.
fm.log("fablabs ", url, fablabs)

2025-12-18T13:25Z fablabs  https://fablabs.io/labs.json?class=btn+btn-primary (2696, 23)

# Clean fablabs dataset

# Empty or missing activity_status values become "unknown".
activity_status = fablabs["activity_status"].replace("", pd.NA)
fablabs["activity_status"] = activity_status.fillna("unknown")

# Normalise country_code to uppercase.
fablabs["country_code"] = fablabs["country_code"].str.upper()

# Drop every lab whose activity_status is "closed".
closed_mask = fablabs["activity_status"].eq("closed")
fablabs = fablabs[~closed_mask]

fm.log("fablabs ", "cleaning", fablabs)

2025-12-18T13:25Z fablabs  cleaning (2603, 23)

# Load the GeoNames country list (XML endpoint) into a DataFrame,
# parsed with the standard-library ElementTree backend.
url = "http://api.geonames.org/countryInfo?username=fab23workshop"
countries = pd.read_xml(path_or_buffer=url, parser="etree")

# Record the source URL and the loaded frame through the project log helper.
fm.log("countries ", url, countries)

2025-12-18T13:26Z countries  http://api.geonames.org/countryInfo?username=fab23workshop (250, 18)

# Clean country list

# Write the literal code "NA" back where it belongs:
#   - countryCode for Namibia
#   - continent for every North American country
# NOTE(review): presumably needed because the string "NA" was read as a
# missing value when the XML was loaded - confirm.
for match_column, match_value, target_column in (
    ("countryName", "Namibia", "countryCode"),
    ("continentName", "North America", "continent"),
):
    countries.loc[countries[match_column] == match_value, target_column] = "NA"

fm.log("countries ", "cleaning", countries)

2025-12-18T13:26Z countries  cleaning (250, 18)

# Count fablabs per country and derive density metrics on the countries dataset

# Number of labs per country_code, as a Series named fablabs_c.
aux = fablabs.groupby("country_code").size().rename("fablabs_c")

# Left join keeps every country; countries without any lab get a count of 0.
countries = countries.merge(
    aux,
    how="left",
    left_on="countryCode",
    right_index=True,
)
countries["fablabs_c"] = countries["fablabs_c"].fillna(0)

# fablabs_cpop: labs per 1,000,000 inhabitants.
countries["fablabs_cpop"] = countries["fablabs_c"].mul(1000000).div(countries["population"])

# fablabs_carea: labs per 10,000 units of areaInSqKm.
countries["fablabs_carea"] = countries["fablabs_c"].mul(10000).div(countries["areaInSqKm"])

# Keep df as an alias of the processed countries frame.
df = countries
fm.log("countries ", "processed", countries)

2025-12-18T13:26Z countries  processed (250, 21)

df = countries

# Keep only countries that have at least one fablab.
df_filtered = df[df["fablabs_c"].gt(0)]

# Histogram of fablabs_cpop (missing values dropped), drawn on the current axes.
ax = plt.gca()
ax.hist(df_filtered["fablabs_cpop"].dropna())
ax.set_xlabel("fablabs_cpop")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of fablabs_cpop (countries with fablabs_c > 0)")
plt.show()

# Summary statistics of fablabs_cpop over countries with at least one fablab.
df_filtered = df[df["fablabs_c"].gt(0)]
cpop = df_filtered["fablabs_cpop"]

mean_val = cpop.mean()
median_val = cpop.median()
std_val = cpop.std()

# Bare expression: shown as the cell result when run in a notebook.
(mean_val, median_val, std_val)

(np.float64(1.7844279691128933), 0.6484243288808196, 4.904440202652281)

# Export a compact per-country table (country code, lab count, labs per
# million inhabitants, population) to outputs/countries.csv.
from pathlib import Path

output_path = Path("outputs") / "countries.csv"
# DataFrame.to_csv does not create missing directories and raises OSError
# when the target folder is absent, so make sure it exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)

df = countries[["countryCode", "fablabs_c", "fablabs_cpop", "population"]]
df.to_csv(output_path, index=False)
fm.log("countries", "selection", df)

2025-12-18T13:27Z countries selection (250, 4)

# Restrict to countries with a population of at least 1,000,000
# that also have at least one fablab.
has_min_population = df["population"].ge(1_000_000)
has_fablabs = df["fablabs_c"].gt(0)
df_filtered = df[has_min_population & has_fablabs]

# Describe the fablabs_cpop distribution of that subset.
cpop = df_filtered["fablabs_cpop"]
stats = {
    statistic: getattr(cpop, statistic)()
    for statistic in ("mean", "median", "std", "skew")
}

print(stats)

{'mean': np.float64(0.7967723969002516), 'median': 0.497799527975779, 'std': 0.8733868149300991, 'skew': np.float64(1.8342123054356458)}

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Distribution plots of fablabs_cpop.
# For now the only filter is fablabs_c > 0.
df_filtered = df[df["fablabs_c"].gt(0)]

# Series to plot, with missing values removed.
values = df_filtered["fablabs_cpop"].dropna()

# --- Histogram (normal scale) ---
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(values, bins=30, kde=False, ax=ax)
ax.set_title("Histogram of fablabs_cpop (normal scale)")
ax.set_xlabel("fablabs_cpop")
ax.set_ylabel("Count")
plt.show()

# --- Histogram (log scale) ---
# Same histogram, but with the count axis on a logarithmic scale.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(values, bins=30, kde=False, ax=ax)
ax.set_yscale("log")
ax.set_title("Histogram of fablabs_cpop (log scale)")
ax.set_xlabel("fablabs_cpop")
ax.set_ylabel("Count (log scale)")
plt.show()

# --- Histogram with std-based bins ---
import numpy as np
mean = values.mean()
std = values.std()
# Bin edges start at 0 and advance one standard deviation at a time,
# extending one step past the maximum so the largest value is covered.
# NOTE(review): a zero or NaN std (fewer than two distinct values) makes
# np.arange fail here - confirm the input always has enough spread.
bin_edges = np.arange(0, values.max() + std, std)

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(values, bins=bin_edges, kde=False, ax=ax)
ax.set_title("Histogram of fablabs_cpop (std-based bins)")
ax.set_xlabel("fablabs_cpop")
ax.set_ylabel("Count")
plt.show()

# --- KDE plot ---
fig, ax = plt.subplots(figsize=(8, 5))
sns.kdeplot(values, fill=True, ax=ax)
ax.set_title("KDE plot of fablabs_cpop")
ax.set_xlabel("fablabs_cpop")
plt.show()

# --- Boxplot ---
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=values, ax=ax)
ax.set_title("Boxplot of fablabs_cpop")
ax.set_xlabel("fablabs_cpop")
plt.show()

Session 5: Probability

Synopsis

Resources

Assignment

Review

A. Research ideas

B. Research planning and design

C. Data collection

D. Data processing

E. Data Study and Analysis

Conclusion

What would be interesting to learn more from the SDG profile?

F. Data Publishing and Access

G. Data Preservation

H. Data Re-use

Evaluation and Follow-up

Follow-up

Review