[Pieter van der Hijden] - Fab Futures - Data Science

Data Science Session 2: Tools¶

Synopsis¶

Neil introduced tools, with a focus on open source tools and especially programming tools (see resources).

Resources¶

  • Tools
    • Programming
      • Javascript
      • Rust; designed to fix security and stability issues found in other languages
      • Python tutorial
      • Jupyter
      • Git; distributed version control
    • Mathematics
      • Jax; Python, NumPy acceleration on CPUs, GPUs, TPUs
      • NumPy; tutorial; data types
      • Numba; Python compilation for speed
      • PyTorch; deep learning
      • scikit-learn
      • SciPy
    • Visualization
      • D3; wide range of browser-based visualizations
      • Matplotlib; tutorial; gallery
      • Plotly; low code, browser applications
      • Seaborn; statistical plotting
      • Vega-Altair; declarative rather than imperative visualization
    • Data
      • files
        • CSV, XML, YAML, binary, ...
      • spreadsheets
        • JupyterLab spreadsheet editor
      • pandas
      • MySQL

My addition:

  • The Data Workbench and private collection of tools

Assignment¶

  • Visualize your datasets
    1. See below for timelines of fablabs
    2. See also the visualizations we used in the notebook for session 01.

A. Research ideas¶

The purpose of this notebook is to:

  • build time series of fablab data
  • visualize these as timelines

B. Research planning and design¶

  • collect archived copies of fablab data covering the last six years (save them for use in session 3)
  • create a combined dataset
  • produce timelines at world level, for a single country, and for all countries
In [1]:
# import python modules
import fabmodules as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from datetime import datetime
In [2]:
# set parameters
output_path = fm.output_path = "outputs/"
today = fm.today = datetime.today().strftime("%Y-%m-%d")

# colors and more
traffic_strong="grey,red,orange,yellow,green".split(",")
traffic_soft=np.array([(231,231,242),(247,149,148),(247,180,149),(247,220,149),(144,240,156)]) / 255 # grey, red, orange, yellow, green
blue_scale=np.array([(231,231,242),(230,245,255),(189,227,251),(148,209,247),(81,172,230),(16,106,166)]) / 255 # grey, blue 20, 40, 60, 80, 100
green_scale=np.array([(231,231,242),(228,252,231),(196,245,202),(144,240,156),(84,209,95),(18,179,47)]) /255 # grey, green 20, 40, 60, 80, 100

C. Data collection¶

Fablabs.io only produces a snapshot (i.e. no timelines) of the fablab register, so we need to combine snapshots from the past to compile timelines.

  1. We manually copied the backup collection labs.json (starting December 2018) to ../datasets. We only picked the "medio" files (close to or on Jun 30) and the "ultimo" files (close to or on Dec 31) of each calendar year.
  2. Once we have read these files, we will refer to their dates as 18u, 19m, 19u, ..., 24m, 24u, 25m (24u = 2024 ultimo, 25m = 2025 medio).
  3. We will limit the fields we read to:
    • id, slug, name for identification
    • activity_status (plan, active, corona, closed) for detecting inflow and outflow (empty or NaN values will be replaced by "unknown"); activity_status did not exist before 19u.
    • kind_name (mini, mobile, fab_lab) for detecting possible growth (empty or NaN values will be replaced by "unknown")
    • country_code (ISO 3166 two-character code) for grouping
  4. We will compose a single dataframe with all these data, with an extra column "date_code" indicating their date, e.g. 24u or 25m.
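The filename-to-date-code mapping described in step 2 can be sketched as a small stand-alone helper (a hypothetical extraction of the logic used inside read_single_labs below):

```python
def date_code(filename: str) -> str:
    """Map an archive filename like '2024.12.31_labs.json' to a date code like '24u'.

    'm' (medio) = mid-year snapshot (June/July); 'u' (ultimo) = end-of-year snapshot.
    """
    year = filename[2:4]    # last two digits of the year
    month = filename[5:7]
    return year + ("m" if month in ("06", "07") else "u")

print(date_code("2024.12.31_labs.json"))  # → 24u
print(date_code("2025.06.30_labs.json"))  # → 25m
```

Note that the mid-year snapshots were not always taken exactly on Jun 30 (e.g. 2019.07.04), which is why both "06" and "07" map to "m".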

List the archived fablab files¶

In [3]:
# Cesar García helped to figure this out a couple of years ago
# list the datasets directory
history = sorted(os.listdir("datasets"))
history
Out[3]:
['.gitignore',
 '.ipynb_checkpoints',
 '2019.07.04_labs.json',
 '2019.12.27_labs.json',
 '2020.07.01_labs.json',
 '2020.12.31_labs.json',
 '2021.07.06_labs.json',
 '2021.12.31_labs.json',
 '2022.07.03_labs.json',
 '2022.12.31_labs.json',
 '2023.06.30_labs.json',
 '2023.12.31_labs.json',
 '2024.06.30_labs.json',
 '2024.12.31_labs.json',
 '2025.06.30_labs.json']

Read the fablab files and combine into a single dataset (add a date_code to each record)¶

In [4]:
# if a labs.json file is not well-formed and causes errors,
# it will be skipped (shown as (0, 0) in the log messages)

# read and clean a single labs.json file
def read_single_labs(filename):
    date_code = filename[2:4] + ("m" if filename[5:7] in ("06", "07") else "u")
    # read the fablab register snapshot
    url = "datasets/" + filename
    try:
        df = pd.read_json(url)  # read all fields; activity_status did not exist before 19u
        if 'activity_status' not in df.columns:
            df['activity_status'] = "unknown"
        df['activity_status'] = df['activity_status'].replace('', pd.NA)
        df['activity_status'] = df['activity_status'].fillna('unknown')
        df['kind_name'] = df['kind_name'].replace('', pd.NA)
        df['kind_name'] = df['kind_name'].fillna('unknown')
        df["country_code"] = df["country_code"].str.upper()
        df['date_code'] = date_code
    except Exception:  # not a bare except, so SystemExit/KeyboardInterrupt still propagate
        df = pd.DataFrame()
    fm.log(date_code, url, df)
    return df
In [5]:
# we restrict the fields to the most relevant ones
# we always can extend this fieldlist later

fieldlist = "date_code,id,slug,name,activity_status,kind_name,country_code".split(",")
combined = pd.DataFrame()
for each in history:
    if "labs.json" in each:
        df = read_single_labs(each)
        if df.shape[0] > 0:
            aux = df[fieldlist]
            combined = pd.concat([combined, aux])
fm.log("combined","dataframe",combined)
2025-12-18T12:32Z 19m datasets/2019.07.04_labs.json (1729, 24)
2025-12-18T12:32Z 19u datasets/2019.12.27_labs.json (1849, 24)
2025-12-18T12:32Z 20m datasets/2020.07.01_labs.json (1933, 24)
2025-12-18T12:32Z 20u datasets/2020.12.31_labs.json (1998, 24)
2025-12-18T12:32Z 21m datasets/2021.07.06_labs.json (2029, 24)
2025-12-18T12:32Z 21u datasets/2021.12.31_labs.json (2039, 24)
2025-12-18T12:32Z 22m datasets/2022.07.03_labs.json (2069, 24)
2025-12-18T12:32Z 22u datasets/2022.12.31_labs.json (2117, 24)
2025-12-18T12:32Z 23m datasets/2023.06.30_labs.json (2144, 24)
2025-12-18T12:32Z 23u datasets/2023.12.31_labs.json (2221, 24)
2025-12-18T12:32Z 24m datasets/2024.06.30_labs.json (2548, 24)
2025-12-18T12:32Z 24u datasets/2024.12.31_labs.json (2597, 24)
2025-12-18T12:32Z 25m datasets/2025.06.30_labs.json (2636, 24)
2025-12-18T12:32Z combined dataframe (27909, 7)
In [6]:
combined
Out[6]:
date_code id slug name activity_status kind_name country_code
0 19m 530 kaasfabriek Kaasfabriek | FabLab regio Alkmaar unknown fab_lab NL
1 19m 1001 vmssfablab Valley Middle School of STEM FAB LAB unknown fab_lab US
2 19m 1193 fablab276valdereuil FabLab 276 Val-de-Reuil unknown fab_lab FR
3 19m 17 ping PiNG unknown fab_lab FR
4 19m 20 fablabinsastrasbourg FabLab INSA Strasbourg unknown fab_lab FR
... ... ... ... ... ... ... ...
2631 25m 13 fablabegypt Fab Lab Egypt active fab_lab EG
2632 25m 2348 martiguesFabLab Martigues'FabLab active fab_lab FR
2633 25m 1730 biscastmanfablab BISCAST ManFabLab active fab_lab PH
2634 25m 1587 fablabjordan The Makerspace (previously TechWorks Amman) active fab_lab JO
2635 25m 2210 fablabaq FabLab L'Aquila active fab_lab IT

27909 rows × 7 columns

Read the countries from geonames.org¶

For the moment, we assume that the list of countries maintained by geonames.org offers sufficient data.
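The two "NA" fixes in the cell below are needed because pandas text readers treat the literal string "NA" as a missing value by default, which clashes with Namibia's country code and North America's continent code. A minimal illustration of the pitfall with read_csv and hypothetical data:

```python
import io
import pandas as pd

csv_data = "countryCode,countryName\nNA,Namibia\nNL,Netherlands\n"

# by default, the string "NA" is parsed as missing (NaN) ...
df_default = pd.read_csv(io.StringIO(csv_data))

# ... unless the default missing-value markers are switched off
df_literal = pd.read_csv(io.StringIO(csv_data), keep_default_na=False)

print(df_default.loc[0, "countryCode"])   # nan
print(df_literal.loc[0, "countryCode"])   # NA
```

The cell below takes the other route: it reads the XML as-is and restores the two "NA" values afterwards.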

In [7]:
# Read the country list
url = "http://api.geonames.org/countryInfo?username=fab23workshop"
countries = pd.read_xml(url, parser="etree")

# Set country_code = "NA" where countryName == "Namibia"
countries.loc[countries["countryName"] == "Namibia", "countryCode"] = "NA"

# Set continent = "NA" where continentName == "North America"
countries.loc[countries["continentName"] == "North America", "continent"] = "NA"

fm.log("countries ",url,countries) 
2025-12-18T12:33Z countries  http://api.geonames.org/countryInfo?username=fab23workshop (250, 18)
In [8]:
countries.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   countryCode       250 non-null    object 
 1   countryName       250 non-null    object 
 2   isoNumeric        250 non-null    int64  
 3   isoAlpha3         250 non-null    object 
 4   fipsCode          247 non-null    object 
 5   continent         250 non-null    object 
 6   continentName     250 non-null    object 
 7   capital           241 non-null    object 
 8   areaInSqKm        250 non-null    float64
 9   population        250 non-null    int64  
 10  currencyCode      249 non-null    object 
 11  languages         247 non-null    object 
 12  geonameId         250 non-null    int64  
 13  west              250 non-null    float64
 14  north             250 non-null    float64
 15  east              250 non-null    float64
 16  south             250 non-null    float64
 17  postalCodeFormat  177 non-null    object 
dtypes: float64(5), int64(3), object(10)
memory usage: 35.3+ KB

D. Data processing¶

Count operational fablabs by country¶

In [9]:
# Count operational fablabs (all activity status except "closed") 
df = combined
df_filtered = df[df["activity_status"] != "closed"]

result = (
    df_filtered
    .groupby(["country_code", "date_code"])
    .size()
    .reset_index(name="count")
)

fm.log("result","",result) 
2025-12-18T12:33Z result  (1689, 3)

Check result for single country (use iso-2 code, uppercase)¶

In [10]:
country_code = "FR" # change if you like
result[result["country_code"]==country_code]
Out[10]:
country_code date_code count
501 FR 19m 213
502 FR 19u 217
503 FR 20m 225
504 FR 20u 234
505 FR 21m 234
506 FR 21u 235
507 FR 22m 236
508 FR 22u 242
509 FR 23m 242
510 FR 23u 245
511 FR 24m 275
512 FR 24u 279
513 FR 25m 281

E. Data Study and Analysis¶

In [11]:
# make countries available in fabmodules
fm.countries = countries

Line chart world fablabs¶

In [12]:
# Create line chart for World 
fm.plot_counts(result)
(image: line chart of operational fablab counts, world level)

Line chart single country fablab¶

In [13]:
# create line chart for single country
country_code = "FR" # change if you like
df = result
print("Plotting:", country_code)  
fm.plot_counts(df, country=country_code)
Plotting: FR
(image: line chart of operational fablab counts for the selected country)

Line charts of all countries (remove cell output before committing)¶

In [ ]:
# create line charts for all countries
# BE CAREFUL: REMOVE CELL OUTPUT BEFORE UPDATING REPOSITORY!!!
df = result
for c in sorted(df["country_code"].dropna().unique()):
    print("Plotting:", c)   # optional
    fm.plot_counts(df, country=c)

F. Data Publishing and Access¶

See the images added to the outputs folder (not in repository).

G. Data Preservation¶

Save result file for later use¶

In [14]:
# save result for use in session 3
with pd.ExcelWriter(output_path + "fablabcounts.xlsx", engine="openpyxl") as writer:
    result.to_excel(writer, index=False)

H. Data Re-use¶

See session 03.

Evaluation and Follow-up¶

So far we had only noted the end-of-year number of fablabs and updated a simple spreadsheet. Now a long-time wish could be fulfilled: combining different "freeze" copies of the labs.json files and calculating the end-of-year counts as a time series automatically. (Note: the old numbers were correct nonetheless.) Many more analyses are now possible. Above, we showed the timelines per country; further possibilities follow below.

Follow-up¶

  • Calculate the inflow and outflow of fablabs for every 6-month period during the last six years; visualize and analyze these figures. How are our "birth rate" and "death rate" developing (by country?)? Have any fablabs been added over the past six years that have since closed?
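The inflow/outflow calculation can be sketched by comparing the sets of lab ids present in consecutive snapshots. A minimal sketch, assuming a combined dataframe with the id and date_code columns built above (the miniature example data are hypothetical):

```python
import pandas as pd

# hypothetical miniature version of the combined snapshot dataframe
combined = pd.DataFrame({
    "id":        [1, 2, 1, 2, 3, 2, 3, 4],
    "date_code": ["24m", "24m", "24u", "24u", "24u", "25m", "25m", "25m"],
})

periods = ["24m", "24u", "25m"]  # chronological order of the snapshots
ids = {p: set(combined.loc[combined["date_code"] == p, "id"]) for p in periods}

flows = pd.DataFrame([
    {
        "period": f"{prev}->{curr}",
        "inflow": len(ids[curr] - ids[prev]),    # labs that appeared
        "outflow": len(ids[prev] - ids[curr]),   # labs that disappeared
    }
    for prev, curr in zip(periods, periods[1:])
])
print(flows)
```

This defines outflow as disappearance of an id between snapshots; an alternative definition could use the activity_status field (e.g. a transition to "closed") instead.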

Review¶

In [ ]: