Data Science - Week 2: Machine Learning Assignment (Museum Artifacts)¶
Student: Sedat Yalçın
Dataset: Metropolitan Museum of Art (The Met) Collection API
Source: https://metmuseum.github.io/ (470,000+ artifacts, free API)
Task: Classify museum artifacts by category and predict missing parts
Overview: Machine Learning with Museum Artifacts¶
Problem: Recognize and classify museum artifacts by category
Approach:
- Classify museum artifacts by category using artifact metadata
- Predict the artifact type with a neural network (sculpture, painting, ceramic, jewelry, etc.)
- Missing part prediction: predict which category a new, unseen artifact belongs to
Museum Artifact Categories (see the encoding sketch after this list):
0. Sculpture
1. Painting
2. Ceramic
3. Jewelry
4. Metalwork
5. Stone Artifact
6. Glass Artifact
7. Textile
8. Wood Artifact
9. Inscription/Tablet
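As a minimal sketch of how these category names become the integer labels a classifier needs (the notebook later does this with scikit-learn's LabelEncoder; note that LabelEncoder assigns integers in alphabetical order, so its numbering differs from the list above):

# Sketch: turn category names into integer labels (illustrative only)
from sklearn.preprocessing import LabelEncoder

categories = ['Sculpture', 'Painting', 'Ceramic', 'Jewelry', 'Metalwork',
              'Stone Artifact', 'Glass Artifact', 'Textile', 'Wood Artifact',
              'Inscription/Tablet']
le = LabelEncoder()
le.fit(categories)
# classes_ is sorted alphabetically; the integer for classes_[i] is i
print(dict(zip(le.classes_, range(len(le.classes_)))))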
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
# Fetch real museum data using Metropolitan Museum of Art (The Met) API
# API: https://metmuseum.github.io/
# 470,000+ artifacts, free and open access
print("Fetching data from Metropolitan Museum of Art (The Met) API...")
print("Source: https://metmuseum.github.io/")
print("Note: Real museum data is required for this assignment\n")
def fetch_met_objects(limit=1000):
"""Fetch artifact information from Met Museum API"""
try:
url = "https://collectionapi.metmuseum.org/public/collection/v1/objects"
response = requests.get(url, timeout=10)
if response.status_code == 200:
data = response.json()
all_object_ids = data.get('objectIDs', [])
# Sample randomly from all IDs to get diverse departments
if len(all_object_ids) > limit:
object_ids = random.sample(all_object_ids, limit)
else:
object_ids = all_object_ids[:limit]
print(f"β Found {len(object_ids)} artifact IDs (sampled from {len(all_object_ids)} total)")
artifacts = []
for i, obj_id in enumerate(object_ids):
try:
obj_url = f"https://collectionapi.metmuseum.org/public/collection/v1/objects/{obj_id}"
obj_response = requests.get(obj_url, timeout=5)
if obj_response.status_code == 200:
obj_data = obj_response.json()
# Only include artifacts with valid department
dept = obj_data.get('department', '')
if dept and dept != '':
artifacts.append({
'objectID': obj_data.get('objectID'),
'title': obj_data.get('title', 'Unknown'),
'department': dept,
'classification': obj_data.get('classification', 'Unknown'),
'medium': obj_data.get('medium', 'Unknown'),
'objectBeginDate': obj_data.get('objectBeginDate', 0),
'objectEndDate': obj_data.get('objectEndDate', 0),
'culture': obj_data.get('culture', 'Unknown'),
'dimensions': str(obj_data.get('dimensions', '')),
'tags': str(obj_data.get('tags', []))
})
if (i + 1) % 100 == 0:
print(f" Processed {i + 1} artifacts... ({len(artifacts)} valid)")
except Exception:
continue
return pd.DataFrame(artifacts)
else:
return None
except Exception as e:
print(f"API error: {e}")
return None
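# Optional hardening (a sketch, not part of the assignment): the per-object requests
# above can fail transiently. A requests.Session with urllib3 retries makes the crawl
# more reliable; make_session() is a hypothetical helper, not used elsewhere here.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    """Session with automatic retries on transient HTTP errors (sketch)."""
    session = requests.Session()
    retries = Retry(total=3, backoff_factor=0.5,
                    status_forcelist=[429, 500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    return session
# Usage sketch: session = make_session(); session.get(obj_url, timeout=5)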
# Fetch data from Met Museum API
# Random sampling helps cover many departments (categories can still be imbalanced)
df = fetch_met_objects(limit=1000)
# Data processing
try:
if df is not None and len(df) > 0:
print(f"\nβ Successfully loaded {len(df)} artifacts!")
# Map departments to categories
department_mapping = {
'American Decorative Arts': 'Metalwork',
'The American Wing': 'Metalwork',
'Ancient Near Eastern Art': 'Stone Artifact',
'Arms and Armor': 'Metalwork',
'Arts of Africa, Oceania, and the Americas': 'Sculpture',
'Asian Art': 'Ceramic',
'The Cloisters': 'Stone Artifact',
'Costume Institute': 'Textile',
'Drawings and Prints': 'Painting',
'Egyptian Art': 'Stone Artifact',
'European Paintings': 'Painting',
'European Sculpture and Decorative Arts': 'Sculpture',
'Greek and Roman Art': 'Sculpture',
'Islamic Art': 'Ceramic',
'The Robert Lehman Collection': 'Painting',
'Medieval Art': 'Stone Artifact',
'Musical Instruments': 'Wood Artifact',
'Photographs': 'Painting',
'Modern and Contemporary Art': 'Painting'
}
df['category'] = df['department'].map(department_mapping)
df = df[df['category'].notna()]
# Show department distribution for debugging
print(f"\nDepartment distribution:")
dept_counts = df['department'].value_counts()
for dept, count in dept_counts.items():
category = department_mapping.get(dept, 'Unknown')
print(f" {dept}: {count} artifacts -> {category}")
if len(df) == 0:
raise ValueError("Could not categorize artifacts!")
# Create features
features = []
labels = []
for idx, row in df.iterrows():
try:
begin_date = float(row['objectBeginDate']) if not pd.isna(row['objectBeginDate']) and row['objectBeginDate'] != 0 else 0.0
end_date = float(row['objectEndDate']) if not pd.isna(row['objectEndDate']) and row['objectEndDate'] != 0 else 0.0
# Note: BC dates are negative, so test for missing dates rather than a positive sum
age = (2024 - (begin_date + end_date) / 2) if (begin_date != 0 or end_date != 0) else 1000.0
feature_vector = [
begin_date / 1000.0,
end_date / 1000.0,
age / 1000.0,
hash(str(row['department'])) % 100 / 100.0,
hash(str(row['classification'])) % 100 / 100.0,
hash(str(row['medium'])) % 100 / 100.0,
hash(str(row['culture'])) % 100 / 100.0,
len(str(row['dimensions'])) / 100.0,
len(str(row['title'])) / 100.0,
len(str(row['tags'])) / 100.0
]
features.append(feature_vector)
labels.append(row['category'])
except Exception as e:
continue
if len(features) == 0:
raise ValueError("HiΓ§ ΓΆzellik oluΕturulamadΔ±!")
X = np.array(features)
le_category = LabelEncoder()
y = le_category.fit_transform(labels)
class_names = le_category.classes_.tolist()
feature_names = [
'Begin Date', 'End Date', 'Age', 'Department',
'Classification', 'Medium', 'Culture', 'Dimensions',
'Title Length', 'Tag Count'
]
print(f"\nDataset: {X.shape[0]} artifacts, {X.shape[1]} features")
print(f"Categories ({len(class_names)}): {', '.join(class_names)}")
# Need at least 2 categories
if len(class_names) < 2:
print(f"\nβ Warning: Only found 1 category: {class_names}")
print("Trying to fetch more artifacts with different departments...")
# Try fetching more with different approach
raise ValueError(f"Only 1 category found ({class_names[0]})! Need at least 2 categories. Try increasing the limit or check API availability.")
else:
raise ValueError("Data could not be loaded from API!")
except Exception as e:
print(f"β Error: {e}")
print("\n" + "="*70)
print("ERROR: Could not fetch data from Met Museum API")
print("="*70)
print("\nThis assignment requires real museum data from the Met Museum API.")
print("Please check your internet connection and try again.")
print("\nAPI URL: https://collectionapi.metmuseum.org/public/collection/v1/objects")
print("="*70)
raise
Fetching data from Metropolitan Museum of Art (The Met) API...
Source: https://metmuseum.github.io/
Note: Real museum data is required for this assignment

✓ Found 1000 artifact IDs (sampled from 498912 total)
  Processed 100 artifacts... (99 valid)
  Processed 200 artifacts... (155 valid)
  Processed 300 artifacts... (161 valid)
  Processed 400 artifacts... (161 valid)
  Processed 500 artifacts... (161 valid)
  Processed 600 artifacts... (161 valid)
  Processed 700 artifacts... (161 valid)
  Processed 800 artifacts... (161 valid)
  Processed 900 artifacts... (161 valid)
  Processed 1000 artifacts... (161 valid)

✓ Successfully loaded 161 artifacts!

Department distribution:
  Drawings and Prints: 52 artifacts -> Painting
  Asian Art: 15 artifacts -> Ceramic
  Photographs: 15 artifacts -> Painting
  Costume Institute: 14 artifacts -> Textile
  Greek and Roman Art: 13 artifacts -> Sculpture
  Egyptian Art: 13 artifacts -> Stone Artifact
  European Sculpture and Decorative Arts: 11 artifacts -> Sculpture
  The American Wing: 6 artifacts -> Metalwork
  Islamic Art: 5 artifacts -> Ceramic
  Arms and Armor: 4 artifacts -> Metalwork
  Modern and Contemporary Art: 3 artifacts -> Painting
  Musical Instruments: 2 artifacts -> Wood Artifact
  European Paintings: 1 artifacts -> Painting
  The Cloisters: 1 artifacts -> Stone Artifact

Dataset: 155 artifacts, 10 features
Categories (7): Ceramic, Metalwork, Painting, Sculpture, Stone Artifact, Textile, Wood Artifact
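A note on the feature engineering above: Python's built-in hash() for strings is salted per process (unless PYTHONHASHSEED is set), so the hash-based columns are not reproducible between runs, and unrelated values can collide in the same bucket. A common alternative is one-hot encoding the categorical columns; a minimal sketch, assuming df is the DataFrame built above (hypothetical, not used in this notebook):

# Sketch: one-hot encode categorical metadata instead of hashing (illustrative)
categorical_cols = ['department', 'classification', 'medium', 'culture']
X_onehot = pd.get_dummies(df[categorical_cols].astype(str), prefix=categorical_cols)
numeric = df[['objectBeginDate', 'objectEndDate']].fillna(0).astype(float)
X_alt = pd.concat([numeric, X_onehot], axis=1).to_numpy()
print(X_alt.shape)  # many more columns, but no hash collisions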
Data Visualization¶
Visualize the features of museum artifacts
# Show feature distributions
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
axes = axes.flatten()
for i in range(min(10, X.shape[1])):
axes[i].hist(X[:, i], bins=30, alpha=0.7, edgecolor='black')
axes[i].set_title(feature_names[i], fontsize=9)
axes[i].set_xlabel('Value')
axes[i].set_ylabel('Frequency')
axes[i].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Class distribution
plt.figure(figsize=(10, 5))
unique, counts = np.unique(y, return_counts=True)
plt.bar([class_names[i] for i in unique], counts, color='steelblue', edgecolor='black')
plt.title('Museum Artifact Category Distribution')
plt.xlabel('Category')
plt.ylabel('Number of Artifacts')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
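Before splitting, it is worth confirming that every category has at least two examples, since the stratified split in the next cell cannot handle single-member classes. A quick check using the arrays built above:

# Sanity check: per-class counts before the stratified split (sketch)
counts = np.bincount(y, minlength=len(class_names))
for name, c in zip(class_names, counts):
    print(f"{name}: {c} artifacts")
print("Smallest class size:", counts.min())  # classes with < 2 samples break stratify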
Train-Test Split¶
Split data into training and test sets
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set: {X_train.shape[0]} artifacts")
print(f"Test set: {X_test.shape[0]} artifacts")
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f"\nFeature scaling applied (mean=0, std=1)")
Training set: 124 artifacts
Test set: 31 artifacts

Feature scaling applied (mean=0, std=1)
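Because the categories are imbalanced (Painting dominates), a majority-class baseline is a more honest reference point than uniform random guessing. A short sketch using scikit-learn's DummyClassifier on the split above:

# Majority-class baseline for comparison (sketch)
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train_scaled, y_train)
print(f"Majority-class baseline accuracy: {dummy.score(X_test_scaled, y_test):.4f}")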
Neural Network Model¶
Architecture:
- Input: 10 features (artifact characteristics)
- Hidden layers: 2 layers with 64 and 32 neurons
- Activation: ReLU (Rectified Linear Unit)
- Output: one neuron per category present in the training data (7 in this run; the full mapping has 10)
- Optimizer: Adam (Adaptive Moment Estimation)
# Create MLP Classifier
model = MLPClassifier(
hidden_layer_sizes=(64, 32), # 2 hidden layers
activation='relu', # ReLU activation
solver='adam', # Adam optimizer
alpha=0.001, # L2 regularization
learning_rate='adaptive', # Adaptive learning rate
max_iter=300, # Maximum iterations
random_state=42,
early_stopping=True, # Stop if no improvement
validation_fraction=0.1, # 10% for validation
verbose=True
)
print("Training neural network...")
model.fit(X_train_scaled, y_train)
print(f"\nTraining completed!")
print(f"Number of iterations: {model.n_iter_}")
print(f"Loss: {model.loss_:.4f}")
Training neural network...
Iteration 1, loss = 1.97773564
Validation score: 0.153846
Iteration 2, loss = 1.94753069
Validation score: 0.230769
Iteration 3, loss = 1.91809616
Validation score: 0.230769
Iteration 4, loss = 1.88944279
Validation score: 0.230769
Iteration 5, loss = 1.86158779
Validation score: 0.230769
Iteration 6, loss = 1.83466099
Validation score: 0.230769
Iteration 7, loss = 1.80837993
Validation score: 0.307692
Iteration 8, loss = 1.78269261
Validation score: 0.307692
Iteration 9, loss = 1.75742629
Validation score: 0.307692
Iteration 10, loss = 1.73254434
Validation score: 0.307692
Iteration 11, loss = 1.70828379
Validation score: 0.384615
Iteration 12, loss = 1.68466372
Validation score: 0.461538
Iteration 13, loss = 1.66125862
Validation score: 0.538462
Iteration 14, loss = 1.63794914
Validation score: 0.538462
Iteration 15, loss = 1.61471577
Validation score: 0.615385
Iteration 16, loss = 1.59157251
Validation score: 0.615385
Iteration 17, loss = 1.56844340
Validation score: 0.615385
Iteration 18, loss = 1.54520056
Validation score: 0.615385
Iteration 19, loss = 1.52184493
Validation score: 0.615385
Iteration 20, loss = 1.49840441
Validation score: 0.615385
Iteration 21, loss = 1.47510507
Validation score: 0.615385
Iteration 22, loss = 1.45182601
Validation score: 0.615385
Iteration 23, loss = 1.42849938
Validation score: 0.615385
Iteration 24, loss = 1.40501116
Validation score: 0.615385
Iteration 25, loss = 1.38140452
Validation score: 0.615385
Iteration 26, loss = 1.35750906
Validation score: 0.615385
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.

Training completed!
Number of iterations: 26
Loss: 1.3575
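The verbose log above can also be inspected programmatically: MLPClassifier stores the training loss per iteration in loss_curve_ and, because early_stopping=True, the validation scores in validation_scores_. A small sketch to plot them:

# Plot training loss and validation accuracy recorded during fitting (sketch)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(model.loss_curve_)
ax1.set_title('Training loss per iteration')
ax1.set_xlabel('Iteration')
ax1.set_ylabel('Loss')
ax2.plot(model.validation_scores_)
ax2.set_title('Validation accuracy per iteration')
ax2.set_xlabel('Iteration')
ax2.set_ylabel('Accuracy')
plt.tight_layout()
plt.show()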
Model Evaluation¶
Compare predictions with actual values
# Predictions
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)
# Metrics
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
print("=" * 70)
print("MODEL PERFORMANCE")
print("=" * 70)
print(f"\nTraining Set Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
print(f"Test Set Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
# Baseline: random guessing among the categories present in the data
baseline_acc = 1.0 / len(class_names)
print(f"\nBaseline (random): {baseline_acc:.4f} ({baseline_acc*100:.2f}%)")
print(f"Improvement: {test_acc - baseline_acc:.4f} ({(test_acc - baseline_acc)*100:.2f} percentage points)")
print("\n" + "=" * 70)
print("CLASSIFICATION REPORT")
print("=" * 70)
print(classification_report(y_test, y_test_pred, target_names=class_names))
======================================================================
MODEL PERFORMANCE
======================================================================
Training Set Accuracy: 0.6452 (64.52%)
Test Set Accuracy: 0.5806 (58.06%)
Baseline (random): 0.1429 (14.29%)
Improvement: 0.4378 (43.78 percentage points)
======================================================================
CLASSIFICATION REPORT
======================================================================
precision recall f1-score support
Ceramic 0.29 0.50 0.36 4
Metalwork 0.00 0.00 0.00 2
Painting 0.81 0.93 0.87 14
Sculpture 0.00 0.00 0.00 5
Stone Artifact 0.50 0.33 0.40 3
Textile 1.00 0.67 0.80 3
Wood Artifact 0.00 0.00 0.00 0
accuracy 0.58 31
macro avg 0.37 0.35 0.35 31
weighted avg 0.55 0.58 0.55 31
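With only 31 test artifacts, a single split gives a noisy accuracy estimate; stratified k-fold cross-validation over the whole dataset is one way to get a steadier number. A sketch using a Pipeline so that scaling is refit inside each fold (n_splits is kept at 2 because the smallest class here has only 2 members):

# Cross-validated accuracy estimate (sketch)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score

pipe = make_pipeline(StandardScaler(),
                     MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu',
                                   solver='adam', alpha=0.001, max_iter=300,
                                   random_state=42))
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
scores = cross_val_score(pipe, X, y, cv=cv)
print(f"CV accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")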
Confusion Matrix¶
Show which categories are confused with each other
# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12, 10))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix - Museum Artifact Categories', fontsize=14, pad=20)
plt.colorbar()
# Use actual number of classes, not fixed 10
n_classes = len(class_names)
tick_marks = np.arange(n_classes)
plt.xticks(tick_marks, class_names, rotation=45, ha='right')
plt.yticks(tick_marks, class_names)
plt.ylabel('True Category')
plt.xlabel('Predicted Category')
# Add text annotations
thresh = cm.max() / 2.
for i, j in np.ndindex(cm.shape):
plt.text(j, i, format(cm[i, j], 'd'),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black",
fontsize=9)
plt.tight_layout()
plt.show()
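For reference, recent scikit-learn versions can draw the same plot in a couple of lines with ConfusionMatrixDisplay; a sketch equivalent to the manual version above:

# Same confusion matrix via scikit-learn's built-in display helper (sketch)
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_predictions(
    y_test, y_test_pred,
    labels=np.arange(len(class_names)), display_labels=class_names,
    xticks_rotation=45, cmap=plt.cm.Blues)
plt.title('Confusion Matrix - Museum Artifact Categories')
plt.tight_layout()
plt.show()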
Missing Part Prediction: Predict New Artifact Category¶
Predict which category a new museum artifact belongs to
# Make predictions for 10 held-out test artifacts, treated as new, unseen objects
n_new_artifacts = 10
new_artifacts = X_test[:n_new_artifacts]
new_artifacts_scaled = scaler.transform(new_artifacts)
predictions = model.predict(new_artifacts_scaled)
probabilities = model.predict_proba(new_artifacts_scaled)
print("=" * 70)
print("CATEGORY PREDICTIONS FOR NEW ARTIFACTS")
print("=" * 70)
for i in range(n_new_artifacts):
true_label = y_test[i]
pred_label = predictions[i]
confidence = probabilities[i][pred_label] * 100
print(f"\nArtifact {i+1}:")
print(f" True Category: {class_names[true_label]}")
print(f" Prediction: {class_names[pred_label]}")
print(f" Confidence: {confidence:.2f}%")
if true_label == pred_label:
print(f" β Correct prediction!")
else:
print(f" β Incorrect prediction")
# Show top 3 probabilities
top3_indices = np.argsort(probabilities[i])[-3:][::-1]
print(f" Top 3 categories:")
for idx in top3_indices:
print(f" - {class_names[idx]}: {probabilities[i][idx]*100:.2f}%")
======================================================================
CATEGORY PREDICTIONS FOR NEW ARTIFACTS
======================================================================
Artifact 1:
True Category: Painting
Prediction: Painting
Confidence: 22.43%
✓ Correct prediction!
Top 3 categories:
- Painting: 22.43%
- Ceramic: 16.72%
- Wood Artifact: 14.17%
Artifact 2:
True Category: Sculpture
Prediction: Ceramic
Confidence: 21.81%
✗ Incorrect prediction
Top 3 categories:
- Ceramic: 21.81%
- Painting: 16.77%
- Textile: 16.42%
Artifact 3:
True Category: Textile
Prediction: Textile
Confidence: 20.72%
✓ Correct prediction!
Top 3 categories:
- Textile: 20.72%
- Painting: 19.61%
- Wood Artifact: 17.44%
Artifact 4:
True Category: Sculpture
Prediction: Painting
Confidence: 20.73%
✗ Incorrect prediction
Top 3 categories:
- Painting: 20.73%
- Ceramic: 19.20%
- Sculpture: 18.88%
Artifact 5:
True Category: Metalwork
Prediction: Wood Artifact
Confidence: 24.08%
✗ Incorrect prediction
Top 3 categories:
- Wood Artifact: 24.08%
- Sculpture: 21.62%
- Ceramic: 18.26%
Artifact 6:
True Category: Painting
Prediction: Painting
Confidence: 27.91%
✓ Correct prediction!
Top 3 categories:
- Painting: 27.91%
- Ceramic: 18.41%
- Stone Artifact: 15.25%
Artifact 7:
True Category: Painting
Prediction: Painting
Confidence: 26.17%
✓ Correct prediction!
Top 3 categories:
- Painting: 26.17%
- Sculpture: 16.91%
- Ceramic: 15.47%
Artifact 8:
True Category: Stone Artifact
Prediction: Ceramic
Confidence: 20.13%
✗ Incorrect prediction
Top 3 categories:
- Ceramic: 20.13%
- Painting: 19.25%
- Sculpture: 16.88%
Artifact 9:
True Category: Painting
Prediction: Painting
Confidence: 19.10%
✓ Correct prediction!
Top 3 categories:
- Painting: 19.10%
- Ceramic: 16.99%
- Sculpture: 16.86%
Artifact 10:
True Category: Ceramic
Prediction: Ceramic
Confidence: 22.59%
✓ Correct prediction!
Top 3 categories:
- Ceramic: 22.59%
- Sculpture: 16.51%
- Wood Artifact: 15.64%
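The per-artifact top-3 lists above show that the true category is sometimes near the top even when the top-1 prediction is wrong. scikit-learn's top_k_accuracy_score summarizes this over the whole test set; a sketch, assuming the fitted model and scaled test data from above:

# Top-3 accuracy over the full test set (sketch)
from sklearn.metrics import top_k_accuracy_score

test_proba = model.predict_proba(X_test_scaled)
top3 = top_k_accuracy_score(y_test, test_proba, k=3,
                            labels=np.arange(len(class_names)))
print(f"Top-3 accuracy: {top3:.4f}")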
Feature Importance Analysis¶
See which features are more important in distinguishing categories
# Test model performance for each feature
feature_importance = []
for i in range(X_train.shape[1]):
# Remove this feature and retrain model
X_train_reduced = np.delete(X_train_scaled, i, axis=1)
X_test_reduced = np.delete(X_test_scaled, i, axis=1)
model_reduced = MLPClassifier(
hidden_layer_sizes=(64, 32),
activation='relu',
solver='adam',
alpha=0.001,
max_iter=100,
random_state=42,
early_stopping=True,
validation_fraction=0.1,
verbose=False
)
model_reduced.fit(X_train_reduced, y_train)
acc_reduced = accuracy_score(y_test, model_reduced.predict(X_test_reduced))
# Feature importance = full model accuracy - accuracy when feature removed
importance = test_acc - acc_reduced
feature_importance.append(importance)
# Visualize
plt.figure(figsize=(12, 6))
colors = ['red' if x < 0 else 'green' for x in feature_importance]
plt.barh(range(len(feature_names)), feature_importance, color=colors, edgecolor='black')
plt.yticks(range(len(feature_names)), feature_names)
plt.xlabel('Feature Importance (Accuracy Difference)')
plt.title('Museum Artifact Feature Importance')
plt.axvline(x=0, color='black', linestyle='--', linewidth=1)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()
print("\nMost important features:")
sorted_indices = np.argsort(feature_importance)[::-1]
for i, idx in enumerate(sorted_indices[:5]):
print(f"{i+1}. {feature_names[idx]}: {feature_importance[idx]:.4f}")
Most important features:
1. Culture: -0.0968
2. Dimensions: -0.1290
3. End Date: -0.1290
4. Begin Date: -0.1290
5. Tag Count: -0.1613
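The drop-one-feature approach above retrains a smaller model per feature (max_iter=100 versus 300 for the full model), which makes the differences noisy; the negative values simply mean accuracy went up when that feature was removed. scikit-learn's permutation_importance measures a similar effect on the already-trained model without retraining; a sketch:

# Permutation importance on the trained model (sketch, no retraining needed)
from sklearn.inspection import permutation_importance

perm = permutation_importance(model, X_test_scaled, y_test,
                              n_repeats=10, random_state=42)
order = np.argsort(perm.importances_mean)[::-1]
for idx in order:
    print(f"{feature_names[idx]}: {perm.importances_mean[idx]:.4f} "
          f"+/- {perm.importances_std[idx]:.4f}")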
Summary¶
Model: Multi-Layer Perceptron (MLP) - 2 hidden layers (64, 32 neurons)
Features: 10 features extracted from artifact metadata (dates, department, classification, medium, culture, dimensions, title, tags)
Results:
- Test accuracy of 58.06% across 7 categories, versus roughly 14% for uniform random guessing
- Strong on the well-represented Painting class (F1 0.87); weak on underrepresented classes such as Metalwork and Sculpture
- The trained model assigns a category, with class probabilities, to previously unseen artifacts
Dataset: Metropolitan Museum of Art (The Met) Collection API - 470,000+ artifacts, free and open access