
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_examples/compare_dim_reduction.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_auto_examples_compare_dim_reduction.py>`
        to download the full example code.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_compare_dim_reduction.py:


Compare different dimension reduction methods on MNIST and Fashion-MNIST datasets.

This script creates grid visualizations comparing different dimension
reduction methods (PCA, t-SNE, UMAP, Pacmap, Trimap, and PHATE) with different
parameter settings for MNIST and Fashion-MNIST.

.. GENERATED FROM PYTHON SOURCE LINES 9-351

.. code-block:: Python


    import logging
    import time
    import warnings
    from pathlib import Path

    import matplotlib.pyplot as plt
    import numpy as np

    from coco_pipe.dim_reduction import DimReduction

    # Suppress the deprecation warning
    warnings.filterwarnings(
        "ignore", message="'force_all_finite' was renamed to 'ensure_all_finite'"
    )

    # Configure logging
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
    logger = logging.getLogger(__name__)

    # Define directories
    DATA_DIR = Path("test_data/dim_reduction")
    OUTPUT_DIR = Path("examples/outputs")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Define class names for datasets
    MNIST_CLASSES = [str(i) for i in range(10)]
    FASHION_MNIST_CLASSES = [
        "T-shirt/top",
        "Trouser",
        "Pullover",
        "Dress",
        "Coat",
        "Sandal",
        "Shirt",
        "Sneaker",
        "Bag",
        "Ankle boot",
    ]


    def load_dataset(prefix):
        """
        Load a dataset from test_data/dim_reduction.

        The data and labels are read from ``<prefix>_data.npy`` and
        ``<prefix>_labels.npy`` inside ``DATA_DIR``.

        Args:
            prefix (str): Dataset prefix (mnist or fashion_mnist)

        Returns:
            tuple: (X, y, class_names), or (None, None, None) when either file
            is missing (an error is logged in that case).
        """
        data_path = DATA_DIR / f"{prefix}_data.npy"
        labels_path = DATA_DIR / f"{prefix}_labels.npy"

        if not data_path.exists() or not labels_path.exists():
            logger.error(f"Dataset files not found: {data_path} or {labels_path}")
            logger.error(
                "Run scripts/download_mnist_datasets.py first to download the datasets"
            )
            return None, None, None

        logger.info(f"Loading {prefix} dataset...")
        # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary
        # objects, which can execute code. Only load files from a trusted source.
        X = np.load(data_path, allow_pickle=True)
        y = np.load(labels_path, allow_pickle=True)

        # Assign class names based on dataset
        if prefix == "mnist":
            class_names = MNIST_CLASSES
        elif prefix == "fashion_mnist":
            class_names = FASHION_MNIST_CLASSES
        else:
            # Unknown dataset: fall back to "0", "1", ... one per distinct label
            class_names = [str(i) for i in range(len(np.unique(y)))]

        return X, y, class_names


    def apply_dim_reduction(X, method, n_components=2, **kwargs):
        """
        Apply a dimension reduction method to the data.

        Args:
            X (np.ndarray): Input data
            method (str): Dimension reduction method name passed to DimReduction
                (this script uses PCA, TSNE, UMAP, Pacmap, Trimap, and PHATE)
            n_components (int): Number of components in the reduced space
            **kwargs: Additional keyword arguments for the reducer

        Returns:
            tuple: (X_reduced, elapsed_time), where elapsed_time is the
            wall-clock duration of ``fit_transform`` in seconds
        """
        logger.info(
            f"Applying {method} with params {kwargs} to reduce to {n_components} "
            "dimensions..."
        )

        # Initialize DimReduction
        reducer = DimReduction(method=method, n_components=n_components, **kwargs)

        # Apply dimension reduction with timing (construction is not timed)
        start_time = time.time()
        X_reduced = reducer.fit_transform(X)
        elapsed_time = time.time() - start_time

        logger.info(f"{method} completed in {elapsed_time:.2f} seconds")

        return X_reduced, elapsed_time


    def visualize_grid(results, y, class_names, dataset_name):
        """
        Create a grid visualization of dimension reduction results with different
        parameters.

        One column is drawn per method and one row per parameter set. The grid
        is saved as PNG and SVG, and a table of computation times is saved as a
        separate PNG, all under ``OUTPUT_DIR``.

        Args:
            results (dict): Dictionary mapping method names to lists of
                (X_reduced, param_str, elapsed_time) tuples
            y (np.ndarray): Labels
            class_names (list): Names of classes (currently not used by the plot)
            dataset_name (str): Name of the dataset
        """
        # Get methods and number of parameter variations for each
        methods = list(results.keys())
        n_methods = len(methods)
        n_params = max(len(results[method]) for method in methods)

        # Create figure with a grid of subplots. squeeze=False keeps `axes` a 2D
        # array even for a single row and/or a single column, so axes[row, col]
        # is always valid.
        fig, axes = plt.subplots(
            n_params,
            n_methods,
            figsize=(n_methods * 4, n_params * 4),
            squeeze=False,
        )

        # Convert y to integers for coloring
        y_int = y.astype(int) if not np.issubdtype(y.dtype, np.integer) else y

        # Loop through methods (columns)
        for col, method in enumerate(methods):
            method_results = results[method]

            # Loop through parameters for this method (rows)
            for row, (X_reduced, param_str, _) in enumerate(method_results):
                ax = axes[row, col]

                # Plot with different colors for each class
                ax.scatter(
                    X_reduced[:, 0], X_reduced[:, 1], c=y_int, cmap="tab10", alpha=0.5, s=1
                )

                # Set titles
                if row == 0:
                    ax.set_title(f"{method}", fontsize=14, pad=10)

                # Add parameter information
                ax.set_xlabel(param_str, fontsize=10)

                # Remove ticks for cleaner visualization
                ax.set_xticks([])
                ax.set_yticks([])

        # Fill empty subplots if any (methods with fewer results than the
        # tallest column)
        for col, method in enumerate(methods):
            for row in range(len(results[method]), n_params):
                axes[row, col].axis("off")

        plt.suptitle(
            f"Dimension Reduction Comparison - {dataset_name}", fontsize=18, y=0.98
        )
        plt.tight_layout()

        # Save figure
        output_path = OUTPUT_DIR / f"{dataset_name.lower()}_grid_comparison.png"
        plt.savefig(output_path, dpi=300, bbox_inches="tight")
        logger.info(f"Grid visualization saved to {output_path}")

        # Also save as SVG for high-quality reproduction
        svg_path = OUTPUT_DIR / f"{dataset_name.lower()}_grid_comparison.svg"
        plt.savefig(svg_path, format="svg", bbox_inches="tight")

        plt.close()

        # Create a performance comparison table
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.axis("tight")
        ax.axis("off")

        # Prepare data for the table
        table_data = []
        for method in methods:
            for _, param_str, elapsed_time in results[method]:
                table_data.append([method, param_str, f"{elapsed_time:.2f}s"])

        # Create the table
        table = ax.table(
            cellText=table_data,
            colLabels=["Method", "Parameters", "Computation Time"],
            loc="center",
            cellLoc="center",
        )
        table.auto_set_font_size(False)
        table.set_fontsize(12)
        table.scale(1.2, 1.5)

        plt.title(f"Performance Comparison - {dataset_name}", fontsize=16, pad=20)

        # Save the table
        table_path = OUTPUT_DIR / f"{dataset_name.lower()}_performance_table.png"
        plt.savefig(table_path, dpi=300, bbox_inches="tight")
        plt.close()


    def main():
        """
        Main function to run the dimension reduction comparison.

        This function:
        1. Loads both MNIST and Fashion-MNIST datasets
        2. Applies dimension reduction with multiple methods and parameters
        3. Visualizes the results in a grid layout
        """
        # Ensure the output directory exists
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

        # Parameters
        n_components = 2
        n_samples = 5000  # Use a smaller subset for faster computation

        # Define methods and parameters to test
        method_params = {
            "PCA": [
                ({}, "Default"),
                ({"svd_solver": "randomized"}, "Randomized SVD"),
                ({"whiten": True}, "Whitened"),
            ],
            "TSNE": [
                ({"perplexity": 10, "max_iter": 1000}, "perplexity=10"),
                ({"perplexity": 30, "max_iter": 1000}, "perplexity=30"),
                ({"perplexity": 50, "max_iter": 1000}, "perplexity=50"),
            ],
            "UMAP": [
                (
                    {
                        "n_neighbors": 10,
                        "min_dist": 0.1,
                        "random_state": None,
                        "n_jobs": -1,
                    },
                    "n_neighbors=10",
                ),
                (
                    {
                        "n_neighbors": 20,
                        "min_dist": 0.1,
                        "random_state": None,
                        "n_jobs": -1,
                    },
                    "n_neighbors=20",
                ),
                (
                    {
                        "n_neighbors": 40,
                        "min_dist": 0.1,
                        "random_state": None,
                        "n_jobs": -1,
                    },
                    "n_neighbors=40",
                ),
            ],
            "Pacmap": [
                ({"n_neighbors": 10, "MN_ratio": 0.5, "FP_ratio": 2.0}, "n_neighbors=10"),
                ({"n_neighbors": 20, "MN_ratio": 0.5, "FP_ratio": 2.0}, "n_neighbors=20"),
                ({"n_neighbors": 40, "MN_ratio": 0.5, "FP_ratio": 2.0}, "n_neighbors=40"),
            ],
            "Trimap": [
                ({"n_inliers": 10, "n_outliers": 5, "n_random": 5}, "n_inliers=10"),
                ({"n_inliers": 20, "n_outliers": 5, "n_random": 5}, "n_inliers=20"),
                ({"n_inliers": 40, "n_outliers": 5, "n_random": 5}, "n_inliers=40"),
            ],
            "PHATE": [
                ({"knn": 5, "t": "auto"}, "knn=5"),
                ({"knn": 10, "t": "auto"}, "knn=10"),
                ({"knn": 20, "t": "auto"}, "knn=20"),
            ],
        }

        # Process both datasets
        for dataset_name, prefix in [
            ("MNIST", "mnist"),
            ("Fashion-MNIST", "fashion_mnist"),
        ]:
            # Load dataset
            X, y, class_names = load_dataset(prefix)
            if X is None:
                continue

            # Use a subset of samples for faster computation
            if len(X) > n_samples:
                logger.info(f"Using {n_samples} samples from {len(X)} total")
                indices = np.random.choice(len(X), n_samples, replace=False)
                X = X[indices]
                y = y[indices]

            # Results dictionary to store all outputs
            results = {method: [] for method in method_params}

            # Print methods being compared
            logger.info(f"Comparing methods: {', '.join(method_params.keys())}")

            # Apply dimension reduction with each method and parameter set
            for method, param_sets in method_params.items():
                for params, param_str in param_sets:
                    # Best effort: a failing method/parameter set is logged and
                    # skipped so the remaining ones still run.
                    try:
                        X_reduced, elapsed_time = apply_dim_reduction(
                            X, method, n_components=n_components, **params
                        )
                        results[method].append((X_reduced, param_str, elapsed_time))
                    except Exception as e:
                        logger.error(
                            f"Error applying {method} with {param_str} to {dataset_name}: "
                            f"{e}"
                        )

            # Visualize results
            if any(results.values()):
                visualize_grid(results, y, class_names, dataset_name)

                # Print performance comparison
                logger.info(f"\n{dataset_name} Performance Comparison:")
                for method, method_results in results.items():
                    for _, param_str, elapsed_time in method_results:
                        logger.info(f"  {method} ({param_str}): {elapsed_time:.2f} seconds")
            else:
                logger.error(
                    f"No successful dimension reduction methods for {dataset_name}"
                )


    if __name__ == "__main__":
        main()


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 0.004 seconds)


.. _sphx_glr_download_auto_examples_compare_dim_reduction.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: compare_dim_reduction.ipynb <compare_dim_reduction.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: compare_dim_reduction.py <compare_dim_reduction.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: compare_dim_reduction.zip <compare_dim_reduction.zip>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_