Exercises#

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.datasets import load_iris

Clone this notebook and implement the exercises yourself, then verify your results with the provided checking functions. Answers are given below the exercises.

Data Type Classification#

Classify each dataset as nominal, ordinal, discrete, or continuous. Understanding these fundamental data types is crucial for selecting appropriate preprocessing techniques. Replace ‘your_category_here’ with the correct classification for each dataset in the list.

def _ex_1_error_message():
    raise ValueError("Exercise 1: Incorrect classification")

def check_ex_1(data):
    categories = ["nominal", "ordinal", "discrete", "continuous"]
    for d in data:
        if d[0] not in categories:
            raise ValueError(f"All data classifications should be one of the following: {categories}")

    # Answers here
    if not data[0][0] == "nominal":
        _ex_1_error_message()
    if not data[1][0] == "ordinal":
        _ex_1_error_message()
    if not data[2][0] == "ordinal":
        _ex_1_error_message()
    if not data[3][0] == "nominal":
        _ex_1_error_message()
    if not data[4][0] == "discrete":
        _ex_1_error_message()
    if not data[5][0] == "continuous":
        _ex_1_error_message()

    print("Exercise 1: Correct!")
def ex_1():
    """For each of the data types below, classify them as
    nominal, ordinal, discrete or continuous.
    """
    data = [
        ('your_category_here', ['red', 'blue', 'green', 'red', 'blue']),
        ('your_category_here', ['low', 'medium', 'high', 'low', 'medium']),
        ('your_category_here', ['fast', 'slow', 'slowest']),
        ('your_category_here', ['happy', 'disgusted', 'angry', 'sad', 'happy']),
        ('your_category_here', [1, 2, 3, 1, 2]),
        ('your_category_here', [1.5, 2.7, 3.1, 1.8, 2.9]),
    ]
    # Uncomment check
    # check_ex_1(data)

ex_1()

Ordinal Encoding#

Transform categorical education levels into numerical values that preserve their natural ordering. Create a mapping where ‘high school’ = 0, ‘bachelor’ = 1, ‘master’ = 2, and ‘phd’ = 3, then encode the given data accordingly.

def check_ex_2(data):
    answer = [0, 1, 2, 3, 1]
    if not answer == data:
        raise ValueError("Exercise 2: Incorrect labels")
    print("Exercise 2: Correct!")
def ex_2():
    """Encode the following values using ordinal encoding (and pandas, for practice)."""
    data = ['high school', 'bachelor', 'master', 'phd', 'bachelor']

    # Encode - Your implementation here
    encoded_data = [0, 0, 0, 0, 0]

    # Check the mapping, which should be a list of length 'data' where every entry is the correct encoding
    # Uncomment check
    # check_ex_2(encoded_data)

ex_2()

One-Hot Encoding#

Convert categorical animal names into binary feature vectors where each unique category gets its own column; order the columns by each category’s first appearance in the data (cat, dog, bird, fish). Each row should have exactly one ‘1’ and the rest ‘0’s, creating a sparse representation suitable for machine learning algorithms.
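As a quick, hedged illustration on a different toy column (colors, invented for demonstration), pandas’ get_dummies builds such binary columns; note that get_dummies orders its columns alphabetically, whereas this exercise expects columns in order of first appearance.

# Toy illustration only: colors, not the exercise data
colors = pd.Series(['red', 'blue', 'red'])
one_hot_colors = pd.get_dummies(colors, dtype=int)   # columns: 'blue', 'red' (alphabetical)
# one_hot_colors.values -> [[0, 1], [1, 0], [0, 1]]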

def check_ex_3(data):
    """Check one-hot encoding implementation."""
    expected = [
        [1, 0, 0, 0],  # cat (columns in order of first appearance: cat, dog, bird, fish)
        [0, 1, 0, 0],  # dog
        [0, 0, 1, 0],  # bird
        [1, 0, 0, 0],  # cat
        [0, 0, 0, 1]   # fish
    ]

    if data != expected:
        raise ValueError("Exercise 3: Incorrect one-hot encoding")
    print("Exercise 3: Correct!")
def ex_3():
    """Implement one-hot encoding for the given data."""
    data = ['cat', 'dog', 'bird', 'cat', 'fish']

    # Create one-hot encoding - Your implementation here
    one_hot_data = [
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]
    ]

    # Uncomment check
    # check_ex_3(one_hot_data)

ex_3()

Basic Statistics Calculation#

Compute fundamental statistical measures (mode, median, mean, variance) for both discrete and continuous datasets; the check uses NumPy's defaults, so report the population variance (ddof=0). Pay attention to how these statistics differ between data types and what insights they provide about the data distribution.
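As a reminder of the NumPy helpers involved, here is a tiny illustration on a toy list (not the exercise data).

# Toy illustration only
toy = [1, 2, 2, 5]
np.median(toy)                   # 2.0
np.mean(toy)                     # 2.5
np.var(toy)                      # 2.25 (population variance, ddof=0)
max(set(toy), key=toy.count)     # 2 (mode by counting occurrences)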

def check_ex_4(discrete_stats, continuous_stats, discrete_data, continuous_data):
    """Check statistics calculation using numpy."""
    expected_discrete = {
        'mode': max(set(discrete_data), key=discrete_data.count),
        'median': np.median(discrete_data),
        'mean': np.mean(discrete_data),
        'variance': np.var(discrete_data)
    }

    expected_continuous = {
        'mode': max(set(continuous_data), key=continuous_data.count),
        'median': np.median(continuous_data),
        'mean': np.mean(continuous_data),
        'variance': np.var(continuous_data)
    }

    # Check discrete stats
    for key in expected_discrete:
        if abs(discrete_stats[key] - expected_discrete[key]) > 1e-6:
            raise ValueError(f"Exercise 4: Incorrect {key} for discrete data")

    # Check continuous stats
    for key in expected_continuous:
        if abs(continuous_stats[key] - expected_continuous[key]) > 1e-6:
            raise ValueError(f"Exercise 4: Incorrect {key} for continuous data")

    print("Exercise 4: Correct!")
def ex_4():
    """Calculate basic statistics for the given data."""
    discrete_data = [1, 2, 2, 3, 4, 4, 4, 5]
    continuous_data = [1.1, 2.3, 2.3, 3.7, 4.2, 4.2, 4.8, 5.1]

    # Calculate statistics - Your implementation here
    discrete_stats = {
        'mode': 0,
        'median': 0,
        'mean': 0,
        'variance': 0
    }

    # Calculate statistics for continuous data
    continuous_stats = {
        'mode': 0,
        'median': 0,
        'mean': 0,
        'variance': 0
    }

    # Uncomment check
    # check_ex_4(discrete_stats, continuous_stats, discrete_data, continuous_data)

ex_4()

Train-Test Split Implementation#

Manually implement an 80/20 train-test split without using sklearn; with the 6 samples below, that means 4 training samples and 2 test samples. Keep each sample’s features and label together in the same split, and shuffle the indices so that both splits are representative.
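One common pattern, sketched below on hypothetical arrays X and y of ten samples (an assumption for illustration, not the exercise arrays), is to permute an index array and slice it at the 80% mark.

# Sketch on hypothetical arrays X, y with 10 samples each
rng = np.random.default_rng(0)
indices = rng.permutation(10)
cut = int(0.8 * 10)                          # 8 training samples, 2 test samples
train_idx, test_idx = indices[:cut], indices[cut:]
# X_train, X_test = X[train_idx], X[test_idx]
# y_train, y_test = y[train_idx], y[test_idx]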

def check_ex_5(training_features, testing_features, training_labels, testing_labels):
    """Check train-test split implementation."""
    # Check shapes
    if training_features.shape[0] != 4 or testing_features.shape[0] != 2:
        raise ValueError("Exercise 5: Incorrect train-test split sizes")

    if training_labels.shape[0] != 4 or testing_labels.shape[0] != 2:
        raise ValueError("Exercise 5: Incorrect train-test split sizes for labels")

    # Check that the total number of samples is preserved
    all_features = np.vstack([training_features, testing_features])
    all_labels = np.concatenate([training_labels, testing_labels])

    if len(all_features) != 6 or len(all_labels) != 6:
        raise ValueError("Exercise 5: Data loss in train-test split")

    print("Exercise 5: Correct!")
def ex_5():
    """Implement 80/20 train-test split manually."""
    # Create sample data
    feature_matrix = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
    target_labels = np.array([0, 1, 0, 1, 0, 1])

    # Split data - Your implementation here
    training_features = np.array([])
    testing_features = np.array([])
    training_labels = np.array([])
    testing_labels = np.array([])

    # Uncomment check
    # check_ex_5(training_features, testing_features, training_labels, testing_labels)

ex_5()

Min-Max Scaling#

Scale the iris dataset features to the [0, 1] range using the min-max normalization formula: (x - min) / (max - min), applied separately to each feature (column) with that column's own minimum and maximum. This technique preserves the shape of the original distribution while standardizing the scale across all features.
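For intuition, the formula applied to a toy 1-D array (the exercise applies it column-wise to the iris matrix):

# Toy illustration only
toy = np.array([2.0, 4.0, 10.0])
minmax_toy = (toy - toy.min()) / (toy.max() - toy.min())   # -> [0.  , 0.25, 1.  ]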

def check_ex_6(minmax_scaled_features):
    """Check min-max scaling implementation using sklearn."""
    # Load iris data
    iris = load_iris()
    feature_data = iris.data

    # Use sklearn MinMaxScaler
    sklearn_scaler = MinMaxScaler()
    sklearn_scaled = sklearn_scaler.fit_transform(feature_data)

    # Check that scaled data is between 0 and 1
    if np.any(minmax_scaled_features < 0) or np.any(minmax_scaled_features > 1):
        raise ValueError("Exercise 6: Min-max scaled values should be between 0 and 1")

    # Check that at least one value is 0 and one is 1 for each feature
    for col in range(minmax_scaled_features.shape[1]):
        if not (np.min(minmax_scaled_features[:, col]) == 0 and np.max(minmax_scaled_features[:, col]) == 1):
            raise ValueError("Exercise 6: Min-max scaling should map min to 0 and max to 1")

    # Compare with sklearn implementation
    if not np.allclose(minmax_scaled_features, sklearn_scaled, rtol=1e-10):
        raise ValueError("Exercise 6: Implementation doesn't match sklearn MinMaxScaler")

    print("Exercise 6: Correct!")
def ex_6():
    """Implement min-max scaling."""
    # Load iris data
    iris = load_iris()
    feature_data = iris.data

    # Implement min-max scaling - Your implementation here
    minmax_scaled_features = feature_data

    # Uncomment check
    # check_ex_6(minmax_scaled_features)

ex_6()

Standardization#

Transform the iris dataset to have zero mean and unit variance using the formula: (x - mean) / std, computed separately for each feature (column). Use NumPy's default population standard deviation (ddof=0), which is also what sklearn's StandardScaler uses. This technique is particularly useful when features have different units or vastly different scales.
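Again for intuition, the formula on a toy 1-D array (not the iris data):

# Toy illustration only
toy = np.array([1.0, 2.0, 3.0])
z_toy = (toy - toy.mean()) / toy.std()   # -> [-1.2247, 0., 1.2247]; np.std defaults to ddof=0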

def check_ex_7(standardized_features):
    """Check standardization implementation using sklearn."""
    # Load iris data
    iris = load_iris()
    feature_data = iris.data

    # Use sklearn StandardScaler
    sklearn_scaler = StandardScaler()
    sklearn_standardized = sklearn_scaler.fit_transform(feature_data)

    # Check that standardized data has mean close to 0 and std close to 1
    for col in range(standardized_features.shape[1]):
        if abs(np.mean(standardized_features[:, col])) > 1e-10:
            raise ValueError("Exercise 7: Standardized data should have mean close to 0")
        if abs(np.std(standardized_features[:, col]) - 1) > 1e-10:
            raise ValueError("Exercise 7: Standardized data should have standard deviation close to 1")

    # Compare with sklearn implementation
    if not np.allclose(standardized_features, sklearn_standardized, rtol=1e-10):
        raise ValueError("Exercise 7: Implementation doesn't match sklearn StandardScaler")

    print("Exercise 7: Correct!")
def ex_7():
    """Implement standardization (z-score normalization)."""
    # Load iris data
    iris = load_iris()
    feature_data = iris.data

    # Implement standardization - Your implementation here
    standardized_features = feature_data

    # Uncomment check
    # check_ex_7(standardized_features)

ex_7()

Robust Scaling#

Apply robust scaling using the median and interquartile range (IQR) instead of the mean and standard deviation: (x - median) / IQR, computed per feature (column). Because the median and IQR are barely affected by extreme values, this method is well suited to datasets with outliers.
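A minimal sketch on a toy 1-D array with an outlier (not the iris data) shows why the median and IQR are preferred here:

# Toy illustration only
toy = np.array([1.0, 2.0, 3.0, 4.0, 100.0])    # 100.0 is an outlier
median = np.median(toy)                         # 3.0
q1, q3 = np.percentile(toy, [25, 75])           # 2.0, 4.0
robust_toy = (toy - median) / (q3 - q1)         # the outlier maps to 48.5, the rest stay near 0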

def check_ex_8(robust_scaled_features):
    """Check robust scaling implementation using sklearn."""
    # Load iris data
    iris = load_iris()
    feature_data = iris.data

    # Use sklearn RobustScaler
    sklearn_scaler = RobustScaler()
    sklearn_robust = sklearn_scaler.fit_transform(feature_data)

    # Check that robust scaled data has median close to 0
    for col in range(robust_scaled_features.shape[1]):
        if abs(np.median(robust_scaled_features[:, col])) > 1e-10:
            raise ValueError("Exercise 8: Robust scaled data should have median close to 0")

    # Compare with sklearn implementation
    if not np.allclose(robust_scaled_features, sklearn_robust, rtol=1e-10):
        raise ValueError("Exercise 8: Implementation doesn't match sklearn RobustScaler")

    print("Exercise 8: Correct!")
def ex_8():
    """Implement robust scaling using median and IQR."""
    # Load iris data
    iris = load_iris()
    feature_data = iris.data

    # Implement robust scaling - Your implementation here
    robust_scaled_features = feature_data

    # Uncomment check
    # check_ex_8(robust_scaled_features)

ex_8()

Variance Thresholding#

Implement feature selection by removing features with low variance (below 0.5). Low-variance features provide little information for distinguishing between samples and can be safely removed to reduce dimensionality.
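The selection boils down to per-column variances and a boolean mask; a sketch on a tiny toy matrix (not the exercise matrix):

# Toy illustration only
toy = np.array([[1.0, 7.0],
                [2.0, 7.0],
                [3.0, 7.0]])
variances = np.var(toy, axis=0)    # -> [0.667, 0.]
mask = variances > 0.5             # -> [ True, False]
kept = toy[:, mask]                # only the first column survives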

def check_ex_9(selected_feature_matrix, high_variance_features):
    """Check variance thresholding implementation."""
    # Check that low variance feature (index 2) is removed
    if high_variance_features[2]:
        raise ValueError("Exercise 9: Low variance feature should be removed")

    # Check that high variance features are kept
    if not high_variance_features[0] or not high_variance_features[1]:
        raise ValueError("Exercise 9: High variance features should be kept")

    # Check shape of selected data
    if selected_feature_matrix.shape[1] != 2:
        raise ValueError("Exercise 9: Should select 2 features")

    print("Exercise 9: Correct!")
def ex_9():
    """Implement variance thresholding for feature selection."""
    # Create sample data with low variance feature
    feature_matrix = np.array([
        [1, 2, 0.1],
        [2, 3, 0.1],
        [3, 4, 0.1],
        [4, 5, 0.1],
        [5, 6, 0.1]
    ])

    # Select features with variance > threshold (e.g., 0.5) - Your implementation here
    variance_threshold = 0.5

    high_variance_features = np.array([])
    selected_feature_matrix = np.array([])

    # Uncomment check
    # check_ex_9(selected_feature_matrix, high_variance_features)

ex_9()

Correlation-Based Feature Selection#

Identify and remove highly correlated features to reduce redundancy in your dataset. Calculate the pairwise correlation matrix and drop any feature whose absolute correlation with an already-kept feature exceeds a threshold (the answer below uses 0.9), keeping only the most informative ones.
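np.corrcoef gives the pairwise correlation matrix when fed features as rows (hence the transpose); a sketch on a toy matrix (not the exercise matrix):

# Toy illustration only: feature 1 is roughly 2x feature 0
toy = np.array([[1.0, 2.0, 5.0],
                [2.0, 4.1, 3.0],
                [3.0, 6.2, 9.0],
                [4.0, 7.9, 1.0]])
corr = np.corrcoef(toy.T)          # 3x3 matrix of feature-feature correlations
# corr[0, 1] is close to 1, so one of features 0 and 1 is redundant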

def check_ex_10(selected_feature_matrix, features_to_keep):
    """Check correlation-based feature selection."""
    # Check that some correlated features are removed
    if len(features_to_keep) < 2:
        raise ValueError("Exercise 10: Should keep at least 2 features")

    # Check shape of selected data
    if selected_feature_matrix.shape[1] != len(features_to_keep):
        raise ValueError("Exercise 10: Shape mismatch between selected data and features to keep")

    print("Exercise 10: Correct!")
def ex_10():
    """Implement correlation-based feature selection."""
    # Create sample data with two feature pairs that are highly correlated
    feature_matrix = np.array([
        [1, 2.1, 0.5, 4],
        [2, 4.2, 1.2, 8],
        [34, 116.1, 12.1, 912],
        [10, 8.3, 99.2, 45],
        [20.5, 16.2, 200.1, 90]
    ])

    # Select features to keep - Your implementation here
    features_to_keep = np.array([])
    selected_feature_matrix = np.array([])

    # Uncomment check
    # check_ex_10(selected_feature_matrix, features_to_keep)

ex_10()

Feature Space Transformation#

Transform a 2-feature matrix \([F_1, F_2]\) into a new feature space \([F_1^2, F_1 F_2, F_2^2, \log(F_2)]\). This polynomial and logarithmic transformation can help capture non-linear relationships in the data.
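np.column_stack is a convenient way to assemble such a matrix; a sketch on a toy two-column matrix (not the exercise data):

# Toy illustration only
toy = np.array([[1.0, 10.0],
                [2.0, 20.0]])
f1, f2 = toy[:, 0], toy[:, 1]
expanded = np.column_stack([f1 ** 2, f1 * f2, f2 ** 2, np.log(f2)])
# expanded[0] -> [1., 10., 100., 2.3026]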

def check_ex_11(transformed_feature_matrix):
    """Check feature space transformation."""
    # Check shape (should be 4 rows, 4 columns)
    if transformed_feature_matrix.shape != (4, 4):
        raise ValueError("Exercise 11: Incorrect shape of transformed matrix")

    # Check specific transformations for first row [1, 2]
    expected_first_row = [1, 2, 4, np.log(2)]  # F1^2, F1F2, F2^2, log(F2)

    for i, val in enumerate(expected_first_row):
        if abs(transformed_feature_matrix[0, i] - val) > 1e-6:
            raise ValueError(f"Exercise 11: Incorrect transformation at position [0, {i}]")

    print("Exercise 11: Correct!")
def ex_11():
    """Transform feature space using polynomial and logarithmic transformations."""
    # Create sample data with 2 features
    original_feature_matrix = np.array([
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5]
    ])

    # Create transformed matrix - Your implementation here
    transformed_feature_matrix = np.array([])

    # Uncomment check
    # check_ex_11(transformed_feature_matrix)

ex_11()

Answers#

def ex_1():
    """For each of the data types below, classify them as
    nominal, ordinal, discrete or continuous.
    """
    data = [
        ('nominal', ['red', 'blue', 'green', 'red', 'blue']),
        ('ordinal', ['low', 'medium', 'high', 'low', 'medium']),
        ('ordinal', ['fast', 'slow', 'slowest']),
        ('nominal', ['happy', 'disgusted', 'angry', 'sad', 'happy']),
        ('discrete', [1, 2, 3, 1, 2]),
        ('continuous', [1.5, 2.7, 3.1, 1.8, 2.9]),
    ]
    check_ex_1(data)
def ex_2():
    """Encode the following values using ordinal encoding (and pandas, for practice)."""
    data = ['high school', 'bachelor', 'master', 'phd', 'bachelor']

    # Encode with an explicit mapping that preserves the natural ordering
    pd_data = pd.DataFrame({"education": data})
    mapping = {'high school': 0, 'bachelor': 1, 'master': 2, 'phd': 3}
    encoded_data = pd_data['education'].map(mapping)

    # Check the mapping, which should be a list of length 'data' where every entry is the correct encoding
    check_ex_2(encoded_data.to_list())
def ex_3():
    """Implement one-hot encoding for the given data."""
    data = ['cat', 'dog', 'bird', 'cat', 'fish']

    # Create DataFrame
    df = pd.DataFrame({'animal': data})

    # Get unique values in order of first appearance (cat, dog, bird, fish)
    unique_animals = df['animal'].unique()

    # Create one-hot encoding
    one_hot_data = []
    for animal in data:
        encoding = [1 if animal == unique_animal else 0 for unique_animal in unique_animals]
        one_hot_data.append(encoding)

    check_ex_3(one_hot_data)
def ex_4():
    """Calculate basic statistics for the given data."""
    discrete_data = [1, 2, 2, 3, 4, 4, 4, 5]
    continuous_data = [1.1, 2.3, 2.3, 3.7, 4.2, 4.2, 4.8, 5.1]

    # Calculate statistics for discrete data
    discrete_stats = {
        'mode': max(set(discrete_data), key=discrete_data.count),
        'median': np.median(discrete_data),
        'mean': np.mean(discrete_data),
        'variance': np.var(discrete_data)
    }

    # Calculate statistics for continuous data
    continuous_stats = {
        'mode': max(set(continuous_data), key=continuous_data.count),
        'median': np.median(continuous_data),
        'mean': np.mean(continuous_data),
        'variance': np.var(continuous_data)
    }

    check_ex_4(discrete_stats, continuous_stats, discrete_data, continuous_data)
def ex_5():
    """Implement 80/20 train-test split manually."""
    # Create sample data
    feature_matrix = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
    target_labels = np.array([0, 1, 0, 1, 0, 1])

    # Create indices and shuffle them so the split is not order-dependent
    rng = np.random.default_rng(42)
    sample_indices = rng.permutation(len(feature_matrix))

    # Split indices (80% train, 20% test)
    train_split_index = int(0.8 * len(feature_matrix))
    training_indices = sample_indices[:train_split_index]
    testing_indices = sample_indices[train_split_index:]

    # Split data
    training_features = feature_matrix[training_indices]
    testing_features = feature_matrix[testing_indices]
    training_labels = target_labels[training_indices]
    testing_labels = target_labels[testing_indices]

    check_ex_5(training_features, testing_features, training_labels, testing_labels)
def ex_6():
    """Implement min-max scaling."""
    # Load iris data
    iris = load_iris()
    feature_data = iris.data

    # Implement min-max scaling
    feature_minimums = feature_data.min(axis=0)
    feature_maximums = feature_data.max(axis=0)
    minmax_scaled_features = (feature_data - feature_minimums) / (feature_maximums - feature_minimums)

    check_ex_6(minmax_scaled_features)
def ex_7():
    """Implement standardization (z-score normalization)."""
    # Load iris data
    iris = load_iris()
    feature_data = iris.data

    # Implement standardization
    feature_means = feature_data.mean(axis=0)
    feature_standard_deviations = feature_data.std(axis=0)
    standardized_features = (feature_data - feature_means) / feature_standard_deviations

    check_ex_7(standardized_features)
def ex_8():
    """Implement robust scaling using median and IQR."""
    # Load iris data
    iris = load_iris()
    feature_data = iris.data

    # Implement robust scaling
    feature_medians = np.median(feature_data, axis=0)
    first_quartiles = np.percentile(feature_data, 25, axis=0)
    third_quartiles = np.percentile(feature_data, 75, axis=0)
    interquartile_ranges = third_quartiles - first_quartiles
    robust_scaled_features = (feature_data - feature_medians) / interquartile_ranges

    check_ex_8(robust_scaled_features)
def ex_9():
    """Implement variance thresholding for feature selection."""
    # Create sample data with low variance feature
    feature_matrix = np.array([
        [1, 2, 0.1],
        [2, 3, 0.1],
        [3, 4, 0.1],
        [4, 5, 0.1],
        [5, 6, 0.1]
    ])

    # Calculate variance for each feature
    feature_variances = np.var(feature_matrix, axis=0)

    # Select features with variance > threshold (e.g., 0.5)
    variance_threshold = 0.5
    high_variance_features = feature_variances > variance_threshold
    selected_feature_matrix = feature_matrix[:, high_variance_features]

    check_ex_9(selected_feature_matrix, high_variance_features)
def ex_10():
    """Implement correlation-based feature selection."""
    # Create sample data with two feature pairs that are highly correlated
    feature_matrix = np.array([
        [1, 2.1, 0.5, 4],
        [2, 4.2, 1.2, 8],
        [34, 116.1, 12.1, 912],
        [10, 8.3, 99.2, 45],
        [20.5, 16.2, 200.1, 90]
    ])

    # Calculate correlation matrix
    correlation_matrix = np.corrcoef(feature_matrix.T)

    # Find highly correlated features (correlation > 0.9)
    correlation_threshold = 0.9
    num_features = feature_matrix.shape[1]
    features_to_remove = set()

    for i in range(num_features):
        for j in range(i+1, num_features):
            if abs(correlation_matrix[i, j]) > correlation_threshold:
                features_to_remove.add(j)  # Drop the second feature

    # Select features to keep
    features_to_keep = [i for i in range(num_features) if i not in features_to_remove]
    selected_feature_matrix = feature_matrix[:, features_to_keep]

    check_ex_10(selected_feature_matrix, features_to_keep)
def ex_11():
    """Transform feature space using polynomial and logarithmic transformations."""
    # Create sample data with 2 features
    original_feature_matrix = np.array([
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5]
    ])

    feature_one = original_feature_matrix[:, 0]
    feature_two = original_feature_matrix[:, 1]

    # Apply transformations
    feature_one_squared = feature_one ** 2
    feature_one_two_product = feature_one * feature_two
    feature_two_squared = feature_two ** 2
    log_feature_two = np.log(feature_two)

    # Create transformed matrix
    transformed_feature_matrix = np.column_stack([feature_one_squared, feature_one_two_product, feature_two_squared, log_feature_two])

    check_ex_11(transformed_feature_matrix)
def run_all():
    ex_1()
    ex_2()
    ex_3()
    ex_4()
    ex_5()
    ex_6()
    ex_7()
    ex_8()
    ex_9()
    ex_10()
    ex_11()

run_all()
Exercise 1: Correct!
Exercise 2: Correct!
Exercise 3: Correct!
Exercise 4: Correct!
Exercise 5: Correct!
Exercise 6: Correct!
Exercise 7: Correct!
Exercise 8: Correct!
Exercise 9: Correct!
Exercise 10: Correct!
Exercise 11: Correct!