
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_examples/compare_phate_umap.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_auto_examples_compare_phate_umap.py>`
        to download the full example code.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_compare_phate_umap.py:


Compare PHATE and UMAP dimensionality reduction methods on MNIST and Fashion-MNIST datasets.

This script creates visualizations comparing PHATE and UMAP with different parameter
settings, showing how they perform on the same datasets.

.. GENERATED FROM PYTHON SOURCE LINES 9-334

.. code-block:: Python


    import logging
    import time
    import warnings
    from pathlib import Path

    import matplotlib.pyplot as plt
    import numpy as np
    import phate
    import umap

    # Suppress the deprecation warning
    warnings.filterwarnings(
        "ignore", message="'force_all_finite' was renamed to 'ensure_all_finite'"
    )

    # Configure logging
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
    logger = logging.getLogger(__name__)

    # Define directories
    DATA_DIR = Path("test_data/dim_reduction")
    OUTPUT_DIR = Path("examples/outputs")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Define class names for datasets
    MNIST_CLASSES = [str(i) for i in range(10)]
    FASHION_MNIST_CLASSES = [
        "T-shirt/top",
        "Trouser",
        "Pullover",
        "Dress",
        "Coat",
        "Sandal",
        "Shirt",
        "Sneaker",
        "Bag",
        "Ankle boot",
    ]


    def load_dataset(prefix):
        """
        Load a dataset from test_data/dim_reduction.

        Reads ``<prefix>_data.npy`` and ``<prefix>_labels.npy`` from ``DATA_DIR``.

        Args:
            prefix (str): Dataset prefix (mnist or fashion_mnist)

        Returns:
            tuple: (X, y, class_names), or (None, None, None) when either file
                is missing (the problem is logged, no exception is raised)
        """
        data_path = DATA_DIR / f"{prefix}_data.npy"
        labels_path = DATA_DIR / f"{prefix}_labels.npy"

        if not data_path.exists() or not labels_path.exists():
            logger.error(f"Dataset files not found: {data_path} or {labels_path}")
            logger.error(
                "Run scripts/download_mnist_datasets.py first to download the datasets"
            )
            return None, None, None

        logger.info(f"Loading {prefix} dataset...")
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
        # which executes code from the file. Only load .npy files you produced
        # yourself; switch to allow_pickle=False once it is confirmed that the
        # stored arrays are plain numeric (non-object) arrays.
        X = np.load(data_path, allow_pickle=True)
        y = np.load(labels_path, allow_pickle=True)

        # Assign class names based on dataset
        if prefix == "mnist":
            class_names = MNIST_CLASSES
        elif prefix == "fashion_mnist":
            class_names = FASHION_MNIST_CLASSES
        else:
            # Unknown dataset: fall back to "0", "1", ... one per distinct label
            class_names = [str(i) for i in range(len(np.unique(y)))]

        return X, y, class_names


    def apply_phate(X, param_str, knn=5, t="auto"):
        """
        Apply PHATE dimensionality reduction.

        Args:
            X (np.ndarray): Input data
            param_str (str): Parameter string for labeling (only used in the log)
            knn (int): Number of nearest neighbors
            t (str or int): Diffusion time scale parameter

        Returns:
            tuple: (X_reduced, elapsed_time) where elapsed_time is in seconds
        """
        logger.info(f"Applying PHATE with parameters: {param_str}...")

        # Initialize PHATE
        phate_operator = phate.PHATE(n_components=2, knn=knn, t=t)

        # Apply PHATE with timing. perf_counter() is monotonic, so the measured
        # interval cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        X_reduced = phate_operator.fit_transform(X)
        elapsed_time = time.perf_counter() - start_time

        logger.info(f"PHATE completed in {elapsed_time:.2f} seconds")

        return X_reduced, elapsed_time


    def apply_umap(X, param_str, n_neighbors=15, min_dist=0.1):
        """
        Apply UMAP dimensionality reduction.

        Args:
            X (np.ndarray): Input data
            param_str (str): Parameter string for labeling (only used in the log)
            n_neighbors (int): Number of nearest neighbors
            min_dist (float): Minimum distance in the embedding

        Returns:
            tuple: (X_reduced, elapsed_time) where elapsed_time is in seconds
        """
        logger.info(f"Applying UMAP with parameters: {param_str}...")

        # Initialize UMAP - set random_state=None to allow n_jobs parallelism
        umap_reducer = umap.UMAP(
            n_components=2,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            random_state=None,  # Allow parallelism
            n_jobs=-1,  # Use all available cores
        )

        # Apply UMAP with timing. perf_counter() is monotonic, so the measured
        # interval cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        X_reduced = umap_reducer.fit_transform(X)
        elapsed_time = time.perf_counter() - start_time

        logger.info(f"UMAP completed in {elapsed_time:.2f} seconds")

        return X_reduced, elapsed_time


    def visualize_comparison(results, y, class_names, dataset_name):
        """
        Create a visualization comparing PHATE and UMAP results with different parameters.

        Writes three files into ``OUTPUT_DIR``: the scatter-plot grid as PNG and
        SVG, and a table of computation times as PNG.

        Args:
            results (dict): Dictionary mapping method names to lists of
                           (X_reduced, param_str, elapsed_time) tuples
            y (np.ndarray): Labels
            class_names (list): Names of classes
            dataset_name (str): Name of the dataset

        Raises:
            ValueError: If ``results`` contains no embedding at all.
        """
        # Get methods and number of parameter variations for each
        methods = list(results.keys())
        n_methods = len(methods)
        n_params = max((len(results[method]) for method in methods), default=0)
        if n_params == 0:
            raise ValueError("results must contain at least one embedding to plot")

        # Create figure with a grid of subplots. squeeze=False always yields a 2D
        # array of axes, so axes[row, col] also works for a single row and/or a
        # single column.
        fig, axes = plt.subplots(
            n_params,
            n_methods,
            figsize=(n_methods * 5, n_params * 5),
            squeeze=False,
        )

        # Convert y to integers for coloring
        y_int = y.astype(int) if not np.issubdtype(y.dtype, np.integer) else y

        # Loop through methods (columns)
        for col, method in enumerate(methods):
            method_results = results[method]

            # Loop through parameters for this method (rows)
            for row, (X_reduced, param_str, _) in enumerate(method_results):
                ax = axes[row, col]

                # Plot with different colors for each class
                scatter = ax.scatter(
                    X_reduced[:, 0], X_reduced[:, 1], c=y_int, cmap="tab10", alpha=0.5, s=2
                )

                # Set titles
                if row == 0:
                    ax.set_title(f"{method}", fontsize=14, pad=10)

                # Add parameter information
                ax.set_xlabel(param_str, fontsize=10)

                # Remove ticks for cleaner visualization
                ax.set_xticks([])
                ax.set_yticks([])

            # A method with fewer successful runs leaves cells unused; hide them
            # instead of showing empty framed axes.
            for row in range(len(method_results), n_params):
                axes[row, col].axis("off")

        # Create a legend. num=None yields one handle per distinct label value, in
        # sorted order, so the handles line up with np.unique(y_int). Labels are
        # looked up per value so they stay correct when a class is absent from y.
        present_classes = np.unique(y_int)
        handles, _ = scatter.legend_elements(prop="colors", num=None)
        legend_labels = [
            class_names[value] if 0 <= value < len(class_names) else str(value)
            for value in present_classes
        ]
        fig.legend(
            handles,
            legend_labels,
            loc="upper right",
            title="Classes",
            bbox_to_anchor=(0.99, 0.99),
        )

        fig.suptitle(f"PHATE vs UMAP - {dataset_name}", fontsize=18, y=0.98)
        fig.tight_layout()

        # Save figure
        output_path = OUTPUT_DIR / f"{dataset_name.lower()}_phate_umap_comparison.png"
        fig.savefig(output_path, dpi=300, bbox_inches="tight")
        logger.info(f"Comparison visualization saved to {output_path}")

        # Also save as SVG for high-quality reproduction
        svg_path = OUTPUT_DIR / f"{dataset_name.lower()}_phate_umap_comparison.svg"
        fig.savefig(svg_path, format="svg", bbox_inches="tight")

        plt.close(fig)

        # Create a performance comparison table
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.axis("tight")
        ax.axis("off")

        # Prepare data for the table
        table_data = [
            [method, param_str, f"{elapsed_time:.2f}s"]
            for method in methods
            for _, param_str, elapsed_time in results[method]
        ]

        # Create the table
        table = ax.table(
            cellText=table_data,
            colLabels=["Method", "Parameters", "Computation Time"],
            loc="center",
            cellLoc="center",
        )
        table.auto_set_font_size(False)
        table.set_fontsize(12)
        table.scale(1.2, 1.5)

        ax.set_title(f"Performance Comparison - {dataset_name}", fontsize=16, pad=20)

        # Save the table
        table_path = OUTPUT_DIR / f"{dataset_name.lower()}_phate_umap_performance.png"
        fig.savefig(table_path, dpi=300, bbox_inches="tight")
        logger.info(f"Performance table saved to {table_path}")

        plt.close(fig)


    def main():
        """
        Main function to run the comparison between PHATE and UMAP.

        For each dataset, a random subset is embedded with every PHATE and UMAP
        parameter set; a failing run is logged and skipped. The comparison figure
        is only produced when every method has at least one successful run.
        """
        # Ensure the output directory exists
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

        # Parameters
        n_samples = 5000  # Use a smaller subset for faster computation

        # Define parameter sets for each method
        phate_params = [
            (5, "auto", "knn=5"),
            (10, "auto", "knn=10"),
            (20, "auto", "knn=20"),
        ]

        umap_params = [
            (10, 0.1, "n_neighbors=10"),
            (20, 0.1, "n_neighbors=20"),
            (40, 0.1, "n_neighbors=40"),
        ]

        # Process both datasets
        for dataset_name, prefix in [
            ("MNIST", "mnist"),
            ("Fashion-MNIST", "fashion_mnist"),
        ]:
            # Load dataset
            X, y, class_names = load_dataset(prefix)
            if X is None:
                continue

            # Use a subset of samples for faster computation
            if len(X) > n_samples:
                logger.info(f"Using {n_samples} samples from {len(X)} total")
                indices = np.random.choice(len(X), n_samples, replace=False)
                X = X[indices]
                y = y[indices]

            # Results dictionary to store all outputs
            results = {"PHATE": [], "UMAP": []}

            # Apply PHATE with different parameters
            for knn, t, param_str in phate_params:
                try:
                    X_reduced, elapsed_time = apply_phate(X, param_str, knn=knn, t=t)
                    results["PHATE"].append((X_reduced, param_str, elapsed_time))
                except Exception as e:
                    # Best effort: one failing parameter set must not abort the rest
                    logger.error(
                        f"Error applying PHATE with {param_str} to {dataset_name}: {e}"
                    )

            # Apply UMAP with different parameters
            for n_neighbors, min_dist, param_str in umap_params:
                try:
                    X_reduced, elapsed_time = apply_umap(
                        X, param_str, n_neighbors=n_neighbors, min_dist=min_dist
                    )
                    results["UMAP"].append((X_reduced, param_str, elapsed_time))
                except Exception as e:
                    # Best effort: one failing parameter set must not abort the rest
                    logger.error(
                        f"Error applying UMAP with {param_str} to {dataset_name}: {e}"
                    )

            # The comparison needs at least one result from every method; name the
            # methods that produced nothing so the log says what actually failed.
            failed_methods = [method for method, runs in results.items() if not runs]
            if failed_methods:
                logger.error(
                    f"Skipping comparison for {dataset_name}: no successful results "
                    f"for {', '.join(failed_methods)}"
                )
                continue

            # Create visualizations
            visualize_comparison(results, y, class_names, dataset_name)

            # Print performance comparison
            logger.info(f"\n{dataset_name} Performance Comparison:")
            for method, method_results in results.items():
                for _, param_str, elapsed_time in method_results:
                    logger.info(f"  {method} ({param_str}): {elapsed_time:.2f} seconds")


    if __name__ == "__main__":
        main()


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 0.293 seconds)


.. _sphx_glr_download_auto_examples_compare_phate_umap.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: compare_phate_umap.ipynb <compare_phate_umap.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: compare_phate_umap.py <compare_phate_umap.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: compare_phate_umap.zip <compare_phate_umap.zip>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_