Exercises#
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.datasets import load_iris
Clone this notebook and implement the exercises yourself, then check your results with the provided checking functions. Answers are provided below the exercises.
Data Type Classification#
Classify each dataset as nominal, ordinal, discrete, or continuous. Understanding these fundamental data types is crucial for selecting appropriate preprocessing techniques. Replace 'your_category_here' with the correct classification for each dataset in the list.
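For reference, pandas can make these distinctions explicit in its dtypes; the toy series below are illustrative only and are not part of the exercise.
colors = pd.Series(['red', 'blue', 'green'], dtype='category')  # nominal: categories with no order
sizes = pd.Categorical(['S', 'M', 'L'], categories=['S', 'M', 'L'], ordered=True)  # ordinal: ordered categories
counts = pd.Series([1, 2, 3])  # discrete: integer dtype
weights = pd.Series([1.5, 2.7, 3.1])  # continuous: float dtype
print(colors.dtype, sizes.ordered, counts.dtype, weights.dtype)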
def _ex_1_error_message():
raise ValueError("Exercise 1: Incorrect classification")
def check_ex_1(data):
categories = ["nominal", "ordinal", "discrete", "continuous"]
for d in data:
if d[0] not in categories:
raise ValueError(f"All data classifications should be one of the following: {categories}")
# Expected classifications
if not data[0][0] == "nominal":
_ex_1_error_message()
if not data[1][0] == "ordinal":
_ex_1_error_message()
if not data[2][0] == "ordinal":
_ex_1_error_message()
if not data[3][0] == "nominal":
_ex_1_error_message()
if not data[4][0] == "discrete":
_ex_1_error_message()
if not data[5][0] == "continuous":
_ex_1_error_message()
print("Exercise 1: Correct!")
def ex_1():
"""For each of the data types below, classify them as
nominal, ordinal, discrete or continuous.
"""
data = [
('your_category_here', ['red', 'blue', 'green', 'red', 'blue']),
('your_category_here', ['low', 'medium', 'high', 'low', 'medium']),
('your_category_here', ['fast', 'slow', 'slowest']),
('your_category_here', ['happy', 'disgusted', 'angry', 'sad', 'happy']),
('your_category_here', [1, 2, 3, 1, 2]),
('your_category_here', [1.5, 2.7, 3.1, 1.8, 2.9]),
]
# Uncomment check
# check_ex_1(data)
ex_1()
Ordinal Encoding#
Transform categorical education levels into numerical values that preserve their natural ordering. Create a mapping where 'high school' = 0, 'bachelor' = 1, 'master' = 2, and 'phd' = 3, then encode the given data accordingly.
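If you want to compare your manual mapping against a library implementation, sklearn's OrdinalEncoder accepts an explicit category order; the shirt-size data below is made up for illustration and is not the exercise data.
from sklearn.preprocessing import OrdinalEncoder
# Passing `categories` fixes the encoding order instead of relying on alphabetical sorting.
size_encoder = OrdinalEncoder(categories=[['small', 'medium', 'large']])
print(size_encoder.fit_transform([['medium'], ['small'], ['large']]))  # -> 1.0, 0.0, 2.0 (one row per sample)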
def check_ex_2(data):
answer = [0, 1, 2, 3, 1]
if data != answer:
raise ValueError("Exercise 2: Incorrect labels")
print("Exercise 2: Correct!")
def ex_2():
"""Encode the following values using ordinal encoding (and pandas, for practice)."""
data = ['high school', 'bachelor', 'master', 'phd', 'bachelor']
# Encode - Your implementation here
encoded_data = [0, 0, 0, 0, 0]
# Check the result, which should be a list with the same length as 'data' where every entry is the correct encoding
# Uncomment check
# check_ex_2(encoded_data)
ex_2()
One-Hot Encoding#
Convert categorical animal names into binary feature vectors where each unique category gets its own column, with columns ordered by first appearance. Each row should have exactly one '1' and the rest '0's, creating a sparse representation suitable for machine learning algorithms.
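As a cross-check, pandas can build the same kind of 0/1 columns with get_dummies; the fruit data below is illustrative only, and note that get_dummies orders its columns by sorted category label rather than by first appearance.
toy = pd.Series(['banana', 'apple', 'banana'])
print(pd.get_dummies(toy))  # one column per category, a single 1 per row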
def check_ex_3(data):
"""Check one-hot encoding implementation."""
expected = [
[1, 0, 0, 0], # cat (columns in order of first appearance: cat, dog, bird, fish)
[0, 1, 0, 0], # dog
[0, 0, 1, 0], # bird
[1, 0, 0, 0], # cat
[0, 0, 0, 1] # fish
]
if data != expected:
raise ValueError("Exercise 3: Incorrect one-hot encoding")
print("Exercise 3: Correct!")
def ex_3():
"""Implement one-hot encoding for the given data."""
data = ['cat', 'dog', 'bird', 'cat', 'fish']
# Create one-hot encoding - Your implementation here
one_hot_data = [
[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]
]
# Uncomment check
# check_ex_3(one_hot_data)
ex_3()
Basic Statistics Calculation#
Compute fundamental statistical measures (mode, median, mean, variance) for both discrete and continuous datasets. Pay attention to how these statistics differ between data types and what insights they provide about data distribution.
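A quick sanity check on a made-up list (not the exercise data): numpy's var() uses the population variance (ddof=0), which is the convention the checker below uses, and keep in mind that tied modes are possible and different tools break such ties differently.
from collections import Counter
toy = [2, 3, 3, 5, 7]
toy_mode, toy_mode_count = Counter(toy).most_common(1)[0]
print(toy_mode, np.median(toy), np.mean(toy), np.var(toy))  # 3, 3.0, 4.0, 3.2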
def check_ex_4(discrete_stats, continuous_stats, discrete_data, continuous_data):
"""Check statistics calculation using numpy."""
expected_discrete = {
'mode': max(set(discrete_data), key=discrete_data.count),
'median': np.median(discrete_data),
'mean': np.mean(discrete_data),
'variance': np.var(discrete_data)
}
expected_continuous = {
'mode': max(set(continuous_data), key=continuous_data.count),
'median': np.median(continuous_data),
'mean': np.mean(continuous_data),
'variance': np.var(continuous_data)
}
# Check discrete stats
for key in expected_discrete:
if abs(discrete_stats[key] - expected_discrete[key]) > 1e-6:
raise ValueError(f"Exercise 4: Incorrect {key} for discrete data")
# Check continuous stats
for key in expected_continuous:
if abs(continuous_stats[key] - expected_continuous[key]) > 1e-6:
raise ValueError(f"Exercise 4: Incorrect {key} for continuous data")
print("Exercise 4: Correct!")
def ex_4():
"""Calculate basic statistics for the given data."""
discrete_data = [1, 2, 2, 3, 4, 4, 4, 5]
continuous_data = [1.1, 2.3, 2.3, 3.7, 4.2, 4.2, 4.8, 5.1]
# Calculate statistics - Your implementation here
discrete_stats = {
'mode': 0,
'median': 0,
'mean': 0,
'variance': 0
}
# Calculate statistics for continuous data
continuous_stats = {
'mode': 0,
'median': 0,
'mean': 0,
'variance': 0
}
# Uncomment check
# check_ex_4(discrete_stats, continuous_stats, discrete_data, continuous_data)
ex_4()
Train-Test Split Implementation#
Manually implement an 80/20 train-test split without using sklearn. Keep each feature row paired with its corresponding label, and make sure no samples are lost or duplicated when splitting into training and testing sets.
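For comparison after you have written the manual version, sklearn's train_test_split handles the shuffling and proportions for you; the toy arrays below are made up for illustration.
from sklearn.model_selection import train_test_split
toy_X = np.arange(10).reshape(5, 2)
toy_y = np.array([0, 1, 0, 1, 0])
X_train, X_test, y_train, y_test = train_test_split(toy_X, toy_y, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape)  # (4, 2) (1, 2)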
def check_ex_5(training_features, testing_features, training_labels, testing_labels):
"""Check train-test split implementation."""
# Check shapes
if training_features.shape[0] != 4 or testing_features.shape[0] != 2:
raise ValueError("Exercise 5: Incorrect train-test split sizes")
if training_labels.shape[0] != 4 or testing_labels.shape[0] != 2:
raise ValueError("Exercise 5: Incorrect train-test split sizes for labels")
# Check that all data is included
all_features = np.vstack([training_features, testing_features])
all_labels = np.concatenate([training_labels, testing_labels])
if len(all_features) != 6 or len(all_labels) != 6:
raise ValueError("Exercise 5: Data loss in train-test split")
print("Exercise 5: Correct!")
def ex_5():
"""Implement 80/20 train-test split manually."""
# Create sample data
feature_matrix = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
target_labels = np.array([0, 1, 0, 1, 0, 1])
# Split data - Your implementation here
training_features = np.array([])
testing_features = np.array([])
training_labels = np.array([])
testing_labels = np.array([])
# Uncomment check
# check_ex_5(training_features, testing_features, training_labels, testing_labels)
ex_5()
Min-Max Scaling#
Scale the iris dataset features to a [0,1] range using the min-max normalization formula: (x - min) / (max - min). This technique preserves the original distribution shape while standardizing the scale across all features.
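The MinMaxScaler imported at the top of this notebook applies the same formula column by column, so it is a convenient way to check your manual result; the tiny array below is illustrative only.
toy = np.array([[1.0], [3.0], [5.0]])
print(MinMaxScaler().fit_transform(toy))  # 0.0, 0.5, 1.0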
def check_ex_6(minmax_scaled_features):
"""Check min-max scaling implementation using sklearn."""
# Load iris data
iris = load_iris()
feature_data = iris.data
# Use sklearn MinMaxScaler
sklearn_scaler = MinMaxScaler()
sklearn_scaled = sklearn_scaler.fit_transform(feature_data)
# Check that scaled data is between 0 and 1
if np.any(minmax_scaled_features < 0) or np.any(minmax_scaled_features > 1):
raise ValueError("Exercise 6: Min-max scaled values should be between 0 and 1")
# Check that at least one value is 0 and one is 1 for each feature
for col in range(minmax_scaled_features.shape[1]):
if not (np.min(minmax_scaled_features[:, col]) == 0 and np.max(minmax_scaled_features[:, col]) == 1):
raise ValueError("Exercise 6: Min-max scaling should map min to 0 and max to 1")
# Compare with sklearn implementation
if not np.allclose(minmax_scaled_features, sklearn_scaled, rtol=1e-10):
raise ValueError("Exercise 6: Implementation doesn't match sklearn MinMaxScaler")
print("Exercise 6: Correct!")
def ex_6():
"""Implement min-max scaling."""
# Load iris data
iris = load_iris()
feature_data = iris.data
# Implement min-max scaling - Your implementation here
minmax_scaled_features = feature_data
# Uncomment check
# check_ex_6(minmax_scaled_features)
ex_6()
Standardization#
Transform the iris dataset to have zero mean and unit variance using the formula: (x - mean) / std. This technique is particularly useful when features have different units or vastly different scales.
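One detail worth knowing before you implement this: StandardScaler (like numpy's std() by default) uses the population standard deviation, ddof=0. A minimal sketch on made-up numbers:
toy = np.array([[1.0], [2.0], [3.0]])
print(StandardScaler().fit_transform(toy))  # roughly -1.22, 0.0, 1.22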
def check_ex_7(standardized_features):
"""Check standardization implementation using sklearn."""
# Load iris data
iris = load_iris()
feature_data = iris.data
# Use sklearn StandardScaler
sklearn_scaler = StandardScaler()
sklearn_standardized = sklearn_scaler.fit_transform(feature_data)
# Check that standardized data has mean close to 0 and std close to 1
for col in range(standardized_features.shape[1]):
if abs(np.mean(standardized_features[:, col])) > 1e-10:
raise ValueError("Exercise 7: Standardized data should have mean close to 0")
if abs(np.std(standardized_features[:, col]) - 1) > 1e-10:
raise ValueError("Exercise 7: Standardized data should have standard deviation close to 1")
# Compare with sklearn implementation
if not np.allclose(standardized_features, sklearn_standardized, rtol=1e-10):
raise ValueError("Exercise 7: Implementation doesn't match sklearn StandardScaler")
print("Exercise 7: Correct!")
def ex_7():
"""Implement standardization (z-score normalization)."""
# Load iris data
iris = load_iris()
feature_data = iris.data
# Implement standardization - Your implementation here
standardized_features = feature_data
# Uncomment check
# check_ex_7(standardized_features)
ex_7()
Robust Scaling#
Apply robust scaling using the median and interquartile range (IQR) instead of the mean and standard deviation: (x - median) / IQR. Because it relies on these rank-based statistics, this method is far less sensitive to outliers, making it well suited to datasets with extreme values.
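A minimal sketch of why this matters, using made-up numbers with one outlier: RobustScaler keeps the bulk of the data near zero while the outlier lands far away.
toy = np.array([[1.0], [2.0], [3.0], [100.0]])
print(RobustScaler().fit_transform(toy))  # the first three values stay close to 0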
def check_ex_8(robust_scaled_features):
"""Check robust scaling implementation using sklearn."""
# Load iris data
iris = load_iris()
feature_data = iris.data
# Use sklearn RobustScaler
sklearn_scaler = RobustScaler()
sklearn_robust = sklearn_scaler.fit_transform(feature_data)
# Check that robust scaled data has median close to 0
for col in range(robust_scaled_features.shape[1]):
if abs(np.median(robust_scaled_features[:, col])) > 1e-10:
raise ValueError("Exercise 8: Robust scaled data should have median close to 0")
# Compare with sklearn implementation
if not np.allclose(robust_scaled_features, sklearn_robust, rtol=1e-10):
raise ValueError("Exercise 8: Implementation doesn't match sklearn RobustScaler")
print("Exercise 8: Correct!")
def ex_8():
"""Implement robust scaling using median and IQR."""
# Load iris data
iris = load_iris()
feature_data = iris.data
# Implement robust scaling - Your implementation here
robust_scaled_features = feature_data
# Uncomment check
# check_ex_8(robust_scaled_features)
ex_8()
Variance Thresholding#
Implement feature selection by removing features with low variance (below 0.5). Low-variance features provide little information for distinguishing between samples and can be safely removed to reduce dimensionality.
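sklearn also ships this as VarianceThreshold, which is handy for checking your manual mask; the toy array below (one constant column) is illustrative only.
from sklearn.feature_selection import VarianceThreshold
toy = np.array([[0.0, 1.0], [0.0, 2.0], [0.0, 3.0]])
print(VarianceThreshold(threshold=0.5).fit_transform(toy))  # the constant first column is dropped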
def check_ex_9(selected_feature_matrix, high_variance_features):
"""Check variance thresholding implementation."""
# Check that low variance feature (index 2) is removed
if high_variance_features[2]:
raise ValueError("Exercise 9: Low variance feature should be removed")
# Check that high variance features are kept
if not high_variance_features[0] or not high_variance_features[1]:
raise ValueError("Exercise 9: High variance features should be kept")
# Check shape of selected data
if selected_feature_matrix.shape[1] != 2:
raise ValueError("Exercise 9: Should select 2 features")
print("Exercise 9: Correct!")
def ex_9():
"""Implement variance thresholding for feature selection."""
# Create sample data with low variance feature
feature_matrix = np.array([
[1, 2, 0.1],
[2, 3, 0.1],
[3, 4, 0.1],
[4, 5, 0.1],
[5, 6, 0.1]
])
# Select features with variance > threshold (e.g., 0.5) - Your implementation here
variance_threshold = 0.5
high_variance_features = np.array([])
selected_feature_matrix = np.array([])
# Uncomment check
# check_ex_9(selected_feature_matrix, high_variance_features)
ex_9()
Correlation-Based Feature Selection#
Identify and remove highly correlated features to reduce redundancy in your dataset. Calculate pairwise correlations and eliminate features that are highly correlated with others, keeping only the most informative ones.
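Before writing the selection loop, it can help to eyeball the pairwise correlations; a pandas DataFrame makes that a one-liner. The column names below are made up for illustration.
toy = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 4, 6, 8], 'c': [4, 1, 3, 2]})
print(toy.corr())  # 'a' and 'b' are perfectly correlated, 'c' is not strongly correlated with either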
def check_ex_10(selected_feature_matrix, features_to_keep):
"""Check correlation-based feature selection."""
# Check that some correlated features are removed
if len(features_to_keep) < 2:
raise ValueError("Exercise 10: Should keep at least 2 features")
# Check shape of selected data
if selected_feature_matrix.shape[1] != len(features_to_keep):
raise ValueError("Exercise 10: Shape mismatch between selected data and features to keep")
print("Exercise 10: Correct!")
def ex_10():
"""Implement correlation-based feature selection."""
# Create sample data with two feature pairs that are highly correlated
feature_matrix = np.array([
[1, 2.1, 0.5, 4],
[2, 4.2, 1.2, 8],
[34, 116.1, 12.1, 912],
[10, 8.3, 99.2, 45],
[20.5, 16.2, 200.1, 90]
])
# Select features to keep - Your implementation here
features_to_keep = np.array([])
selected_feature_matrix = np.array([])
# Uncomment check
# check_ex_10(selected_feature_matrix, features_to_keep)
ex_10()
Feature Space Transformation#
Transform a 2-feature matrix \([F_1, F_2]\) into a new feature space \([F_1^2, F_1 F_2, F_2^2, \log(F_2)]\). This polynomial and logarithmic transformation can help capture non-linear relationships in the data.
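A related tool worth knowing: sklearn's PolynomialFeatures generates the squared and interaction terms automatically (it does not produce the log term this exercise asks for). A small sketch on a single made-up row:
from sklearn.preprocessing import PolynomialFeatures
toy = np.array([[1.0, 2.0]])
print(PolynomialFeatures(degree=2, include_bias=False).fit_transform(toy))  # F1, F2, F1^2, F1*F2, F2^2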
def check_ex_11(transformed_feature_matrix):
"""Check feature space transformation."""
# Check shape (should be 4 rows, 4 columns)
if transformed_feature_matrix.shape != (4, 4):
raise ValueError("Exercise 11: Incorrect shape of transformed matrix")
# Check specific transformations for first row [1, 2]
expected_first_row = [1, 2, 4, np.log(2)] # F1^2, F1F2, F2^2, log(F2)
for i, val in enumerate(expected_first_row):
if abs(transformed_feature_matrix[0, i] - val) > 1e-6:
raise ValueError(f"Exercise 11: Incorrect transformation at position [0, {i}]")
print("Exercise 11: Correct!")
def ex_11():
"""Transform feature space using polynomial and logarithmic transformations."""
# Create sample data with 2 features
original_feature_matrix = np.array([
[1, 2],
[2, 3],
[3, 4],
[4, 5]
])
# Create transformed matrix - Your implementation here
transformed_feature_matrix = np.array([])
# Uncomment check
# check_ex_11(transformed_feature_matrix)
ex_11()
Answers#
def ex_1():
"""For each of the data types below, classify them as
nominal, ordinal, discrete or continuous.
"""
data = [
('nominal', ['red', 'blue', 'green', 'red', 'blue']),
('ordinal', ['low', 'medium', 'high', 'low', 'medium']),
('ordinal', ['fast', 'slow', 'slowest']),
('nominal', ['happy', 'disgusted', 'angry', 'sad', 'happy']),
('discrete', [1, 2, 3, 1, 2]),
('continuous', [1.5, 2.7, 3.1, 1.8, 2.9]),
]
check_ex_1(data)
def ex_2():
"""Encode the following values using ordinal encoding (and pandas, for practice)."""
data = ['high school', 'bachelor', 'master', 'phd', 'bachelor']
# Encode
pd_data = pd.DataFrame({"education": data})
education_order = ['high school', 'bachelor', 'master', 'phd']
mapping = {level: rank for rank, level in enumerate(education_order)}
encoded_data = pd_data['education'].map(mapping)
# Check the result, which should be a list with the same length as 'data' where every entry is the correct encoding
check_ex_2(encoded_data.to_list())
def ex_3():
"""Implement one-hot encoding for the given data."""
data = ['cat', 'dog', 'bird', 'cat', 'fish']
# Create DataFrame
df = pd.DataFrame({'animal': data})
# Get unique values in order of first appearance (pandas unique() preserves order)
unique_animals = df['animal'].unique()
# Create one-hot encoding
one_hot_data = []
for animal in data:
encoding = [1 if animal == unique_animal else 0 for unique_animal in unique_animals]
one_hot_data.append(encoding)
check_ex_3(one_hot_data)
def ex_4():
"""Calculate basic statistics for the given data."""
discrete_data = [1, 2, 2, 3, 4, 4, 4, 5]
continuous_data = [1.1, 2.3, 2.3, 3.7, 4.2, 4.2, 4.8, 5.1]
# Calculate statistics for discrete data
discrete_stats = {
'mode': max(set(discrete_data), key=discrete_data.count),
'median': np.median(discrete_data),
'mean': np.mean(discrete_data),
'variance': np.var(discrete_data)
}
# Calculate statistics for continuous data
continuous_stats = {
'mode': max(set(continuous_data), key=continuous_data.count),
'median': np.median(continuous_data),
'mean': np.mean(continuous_data),
'variance': np.var(continuous_data)
}
check_ex_4(discrete_stats, continuous_stats, discrete_data, continuous_data)
def ex_5():
"""Implement 80/20 train-test split manually."""
# Create sample data
feature_matrix = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
target_labels = np.array([0, 1, 0, 1, 0, 1])
# Create sample indices and shuffle them so the split is random
sample_indices = np.arange(len(feature_matrix))
np.random.default_rng(seed=42).shuffle(sample_indices)
# Split indices (80% train, 20% test)
train_split_index = int(0.8 * len(feature_matrix))
training_indices = sample_indices[:train_split_index]
testing_indices = sample_indices[train_split_index:]
# Split data
training_features = feature_matrix[training_indices]
testing_features = feature_matrix[testing_indices]
training_labels = target_labels[training_indices]
testing_labels = target_labels[testing_indices]
check_ex_5(training_features, testing_features, training_labels, testing_labels)
def ex_6():
"""Implement min-max scaling."""
# Load iris data
iris = load_iris()
feature_data = iris.data
# Implement min-max scaling
feature_minimums = feature_data.min(axis=0)
feature_maximums = feature_data.max(axis=0)
minmax_scaled_features = (feature_data - feature_minimums) / (feature_maximums - feature_minimums)
check_ex_6(minmax_scaled_features)
def ex_7():
"""Implement standardization (z-score normalization)."""
# Load iris data
iris = load_iris()
feature_data = iris.data
# Implement standardization
feature_means = feature_data.mean(axis=0)
feature_standard_deviations = feature_data.std(axis=0)
standardized_features = (feature_data - feature_means) / feature_standard_deviations
check_ex_7(standardized_features)
def ex_8():
"""Implement robust scaling using median and IQR."""
# Load iris data
iris = load_iris()
feature_data = iris.data
# Implement robust scaling
feature_medians = np.median(feature_data, axis=0)
first_quartiles = np.percentile(feature_data, 25, axis=0)
third_quartiles = np.percentile(feature_data, 75, axis=0)
interquartile_ranges = third_quartiles - first_quartiles
robust_scaled_features = (feature_data - feature_medians) / interquartile_ranges
check_ex_8(robust_scaled_features)
def ex_9():
"""Implement variance thresholding for feature selection."""
# Create sample data with low variance feature
feature_matrix = np.array([
[1, 2, 0.1],
[2, 3, 0.1],
[3, 4, 0.1],
[4, 5, 0.1],
[5, 6, 0.1]
])
# Calculate variance for each feature
feature_variances = np.var(feature_matrix, axis=0)
# Select features with variance > threshold (e.g., 0.5)
variance_threshold = 0.5
high_variance_features = feature_variances > variance_threshold
selected_feature_matrix = feature_matrix[:, high_variance_features]
check_ex_9(selected_feature_matrix, high_variance_features)
def ex_10():
"""Implement correlation-based feature selection."""
# Create sample data with two feature pairs that are highly correlated
feature_matrix = np.array([
[1, 2.1, 0.5, 4],
[2, 4.2, 1.2, 8],
[34, 116.1, 12.1, 912],
[10, 8.3, 99.2, 45],
[20.5, 16.2, 200.1, 90]
])
# Calculate correlation matrix
correlation_matrix = np.corrcoef(feature_matrix.T)
# Find highly correlated features (correlation > 0.9)
correlation_threshold = 0.9
num_features = feature_matrix.shape[1]
features_to_remove = set()
for i in range(num_features):
for j in range(i+1, num_features):
if abs(correlation_matrix[i, j]) > correlation_threshold:
features_to_remove.add(j) # Drop the second feature
# Select features to keep
features_to_keep = [i for i in range(num_features) if i not in features_to_remove]
selected_feature_matrix = feature_matrix[:, features_to_keep]
check_ex_10(selected_feature_matrix, features_to_keep)
def ex_11():
"""Transform feature space using polynomial and logarithmic transformations."""
# Create sample data with 2 features
original_feature_matrix = np.array([
[1, 2],
[2, 3],
[3, 4],
[4, 5]
])
feature_one = original_feature_matrix[:, 0]
feature_two = original_feature_matrix[:, 1]
# Apply transformations
feature_one_squared = feature_one ** 2
feature_one_two_product = feature_one * feature_two
feature_two_squared = feature_two ** 2
log_feature_two = np.log(feature_two)
# Create transformed matrix
transformed_feature_matrix = np.column_stack([feature_one_squared, feature_one_two_product, feature_two_squared, log_feature_two])
check_ex_11(transformed_feature_matrix)
def run_all():
ex_1()
ex_2()
ex_3()
ex_4()
ex_5()
ex_6()
ex_7()
ex_8()
ex_9()
ex_10()
ex_11()
run_all()
Exercise 1: Correct!
Exercise 2: Correct!
Exercise 3: Correct!
Exercise 4: Correct!
Exercise 5: Correct!
Exercise 6: Correct!
Exercise 7: Correct!
Exercise 8: Correct!
Exercise 9: Correct!
Exercise 10: Correct!
Exercise 11: Correct!