Note
Go to the end to download the full example code.
End-to-End Pipeline Demo
Demonstrates the full CoCo pipeline:

1. Load (Tabular)
2. Preprocess (StandardScaler)
3. Reduce (PCA & UMAP)
4. Report (Comparison)

Usage:

    python examples/demo_pipeline.py
import logging
from pathlib import Path
import numpy as np
import pandas as pd
from coco_pipe.dim_reduction import DimReduction
from coco_pipe.io.dataset import TabularDataset
from coco_pipe.report import from_reductions
# Configure logging: install a root handler at INFO level so the numbered
# pipeline-step messages logged below are shown when the demo runs.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the functions in this script.
logger = logging.getLogger(__name__)
def _sample_metadata(container) -> dict[str, np.ndarray]:
"""Extract observation-aligned metadata columns from a DataContainer."""
n_samples = np.asarray(container.X).shape[0]
metadata = {}
for key, value in (getattr(container, "coords", {}) or {}).items():
value = np.asarray(value)
if key == "obs":
continue
if value.ndim >= 1 and value.shape[0] == n_samples:
metadata[key] = value
return metadata
def create_synthetic_csv(path: Path, seed: "int | None" = None) -> None:
    """Create a dummy CSV file for the demo.

    The file holds 200 rows: ten Gaussian feature columns ``feat_0`` ..
    ``feat_9`` forming three clusters (50 rows shifted by +5 labelled "A",
    50 rows shifted by -5 labelled "B", 100 unshifted rows labelled "C"),
    a ``label`` column, and two randomly assigned metadata columns,
    ``center`` and ``batch``.

    Parameters
    ----------
    path
        Destination of the CSV file. The parent directory must already exist.
    seed
        Seed for the random generator. ``None`` (the default) draws fresh
        entropy, so every call produces different data; pass an integer to
        get the same file on every run.
    """
    logger.info("Creating synthetic dataset at %s...", path)

    # A local Generator keeps the demo from depending on (or disturbing) the
    # global NumPy random state and makes the output reproducible via `seed`.
    rng = np.random.default_rng(seed)

    n_features = 10
    # (label, number of rows, offset added to every feature)
    clusters = [("A", 50, 5.0), ("B", 50, -5.0), ("C", 100, 0.0)]

    blocks = []
    labels = []
    for name, size, shift in clusters:
        blocks.append(rng.standard_normal((size, n_features)) + shift)
        labels.extend([name] * size)

    df = pd.DataFrame(
        np.vstack(blocks), columns=[f"feat_{i}" for i in range(n_features)]
    )
    df["label"] = labels

    # Add extra metadata for interactive labeling demo
    df["center"] = rng.choice(["Site A", "Site B"], size=len(df))
    df["batch"] = rng.choice(["Batch 1", "Batch 2"], size=len(df))

    df.to_csv(path, index=False)
def _numeric_only(X):
    """Return ``X`` restricted to its numeric columns."""
    if isinstance(X, pd.DataFrame):
        return X.select_dtypes(include=[np.number])
    if hasattr(X, "dtype") and X.dtype == object:
        # A frame built from an object array keeps every column as `object`,
        # so select_dtypes() would drop all of them. infer_objects() first
        # promotes the columns that really hold numbers.
        frame = pd.DataFrame(X).infer_objects()
        return frame.select_dtypes(include=[np.number]).values
    return X


def _standardize(X):
    """Z-score each column of ``X`` using the population standard deviation."""
    mean = X.mean(axis=0)
    # ddof=0 is spelled out because pandas defaults to ddof=1 while NumPy
    # defaults to ddof=0; this keeps DataFrame and ndarray inputs consistent
    # with what scikit-learn's StandardScaler computes.
    std = np.asarray(X.std(axis=0, ddof=0), dtype=float)
    # Constant columns have std == 0; scale them by 1 so they become zeros
    # instead of NaN/inf.
    std = np.where(std == 0, 1.0, std)
    return (X - mean) / std


def main():
    """Run the demo: create data, load, scale, reduce, and write a report.

    Side effects: writes ``examples/outputs/dummy_data.csv`` and
    ``examples/outputs/demo_pipeline.html`` (relative to the current working
    directory), creating the output directory if needed.
    """
    # Setup
    data_path = Path("examples/outputs/dummy_data.csv")
    data_path.parent.mkdir(parents=True, exist_ok=True)
    create_synthetic_csv(data_path)

    # 1. Load Data
    logger.info("1. Loading Data...")
    ds = TabularDataset(
        data_path, target_col="label", sep=",", meta_columns=["center", "batch"]
    )
    container = ds.load()

    # 2. Preprocessing (simulate via manual scaling)
    logger.info("2. Preprocessing...")
    # Ensure X is numeric, then z-score every feature.
    container.X = _numeric_only(container.X)
    container.X = _standardize(container.X)

    # 3. Dimensionality Reduction
    logger.info("3. Running Reductions...")
    # PCA
    pca = DimReduction(method="PCA", n_components=2)
    pca_emb = pca.fit_transform(container.X)
    # UMAP
    umap = DimReduction(method="UMAP", n_components=2, n_neighbors=15)
    umap_emb = umap.fit_transform(container.X)

    # 4. Generate Comparative Report
    logger.info("4. Generating Report...")
    # Dummy metrics for demonstration: these values are hard-coded
    # placeholders, not computed from the embeddings above.
    metrics_data = {
        "Trustworthiness": [0.95, 0.88],
        "Continuity": [0.90, 0.92],
        "Shepard Goodness": [0.85, 0.91],
    }
    metrics_df = pd.DataFrame(metrics_data, index=["PCA", "UMAP"])

    report = from_reductions(
        reductions=[pca, umap],
        container=container,
        embeddings=[pca_emb, umap_emb],
        labels=container.y,
        metadata=_sample_metadata(container),
        title="Pipeline Demo: PCA vs UMAP",
        config={"pipeline": "Full Demo", "scaling": "StandardScaler"},
    )

    # Add comparison section
    report.add_comparison(metrics_df, name="Method Comparison")

    output_path = Path("examples/outputs/demo_pipeline.html")
    report.save(output_path)
    logger.info("Report saved to %s", output_path)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Total running time of the script: (0 minutes 0.416 seconds)