Iterative Balancing Demo¶

Demonstrates the iterative balancing process for datasets.

Demo: Iterative Undersampling with Randomness
============================================
Original Distribution: {'A': 600, 'B': 50}

--- Balancing with random_state=1 ---
Balanced Counts: {'B': 50, 'A': 50}
Class A sample size: 50
First 5 IDs selected for A: [589 523 393 105 420]

--- Balancing with random_state=2 ---
Balanced Counts: {'A': 50, 'B': 50}
Class A sample size: 50
First 5 IDs selected for A: [315 296 342 295 144]

--- Balancing with random_state=3 ---
Balanced Counts: {'A': 50, 'B': 50}
Class A sample size: 50
First 5 IDs selected for A: [164 178 399 231 265]

--- Overlap Analysis ---
Overlap between Run 1 and Run 2 (Class A): 5 / 50
Overlap between Run 1 and Run 3 (Class A): 5 / 50

SUCCESS: Different seeds produced different subsets!

import numpy as np
import pandas as pd

from coco_pipe.io import DataContainer


def run_demo():
    print("Demo: Iterative Undersampling with Randomness")
    print("============================================")

    # 1. create imbalanced data: 600 of class 'A', 50 of class 'B'
    n_a = 600
    n_b = 50

    n_total = n_a + n_b
    X = np.random.randn(n_total, 2)
    y = np.array(["A"] * n_a + ["B"] * n_b)
    ids = np.arange(n_total)

    container = DataContainer(X=X, dims=("obs", "feat"), y=y, ids=ids)

    print(f"Original Distribution: {pd.Series(y).value_counts().to_dict()}")

    # 2. Run balance multiple times with different seeds
    seeds = [1, 2, 3]

    # Store indices selected for the majority class 'A' to compare overlaps
    selected_indices_a = []

    for seed in seeds:
        print(f"\n--- Balancing with random_state={seed} ---")
        balanced = container.balance(
            target="y", strategy="undersample", random_state=seed
        )

        # Check counts
        counts = pd.Series(balanced.y).value_counts().to_dict()
        print(f"Balanced Counts: {counts}")

        # Get indices of class A in this balanced set
        # We can find them by looking at IDs or matching logic, but here we can
        # just inspect the subset
        # Since ids are just 0..649, let's see which of the first 600 were picked
        subset_ids = balanced.ids
        class_a_mask = balanced.y == "A"
        ids_a = subset_ids[class_a_mask]

        print(f"Class A sample size: {len(ids_a)}")
        print(f"First 5 IDs selected for A: {ids_a[:5]}")

        selected_indices_a.append(set(ids_a))

    # 3. Analyze overlap
    set1, set2, set3 = selected_indices_a
    overlap_1_2 = len(set1.intersection(set2))
    overlap_1_3 = len(set1.intersection(set3))

    print("\n--- Overlap Analysis ---")
    print(f"Overlap between Run 1 and Run 2 (Class A): {overlap_1_2} / 50")
    print(f"Overlap between Run 1 and Run 3 (Class A): {overlap_1_3} / 50")

    if overlap_1_2 < 50 and overlap_1_3 < 50:
        print("\nSUCCESS: Different seeds produced different subsets!")
    else:
        print("\nFAILURE: Subsets are identical despite different seeds.")


if __name__ == "__main__":
    run_demo()

Total running time of the script: (0 minutes 0.017 seconds)

Gallery generated by Sphinx-Gallery