Source code for analysis.Dimreduce

import matplotlib.pyplot as plt
from astropy.table import Table
from sklearn.preprocessing import StandardScaler
import umap
from hdbscan import HDBSCAN
from sklearn.mixture import GaussianMixture
import numpy as np
import matplotlib.gridspec as gridspec
from sklearn.manifold import TSNE

[docs]def investigate_umap(
    table_path,
    data_keys,
    label_column,
    labels_name,
    labels_color_map,
    n_neighbors_list=[15, 15, 15],
    min_dist_list=[0.1, 0.3, 1],
    cluster_method=None,  # Accepts 'GMM', 'HDBSCAN', or None
    n_components_gmm=5,
    min_cluster_size_hdbscan=30,
    min_samples_hdbscan=1,
    axis_label_fontsize=18,
    tick_fontsize=18,
    title_fontsize=19,
    legend_fontsize=15
):
    """
    Visualizes UMAP dimensionality reduction results for a high-dimensional dataset and 
    optionally applies clustering (GMM or HDBSCAN) in the reduced space.

    Parameters
    ----------
    table_path : str
        Path to the FITS file containing the dataset.
    data_keys : list of str
        List of column names to use as input features for UMAP.
    label_column : str
        Column name containing original cluster assignments for coloring true label plots.
    labels_name : dict
        Dictionary mapping numerical cluster IDs to string labels (e.g. {1: 'GS/E', 2: 'Splash'}).
    labels_color_map : dict
        Dictionary mapping string labels to matplotlib-compatible color codes.
    n_neighbors_list : list of int, optional
        List of UMAP `n_neighbors` values, one per column of the plot grid.
    min_dist_list : list of float, optional
        List of UMAP `min_dist` values, one per column of the plot grid.
    cluster_method : str or None, optional
        If specified, applies unsupervised clustering in UMAP space. Options:
        - 'GMM': Gaussian Mixture Model clustering (requires `n_components_gmm`).
        - 'HDBSCAN': HDBSCAN clustering (requires `min_cluster_size_hdbscan` and `min_samples_hdbscan`).
        - None: disables clustering, shows only UMAP colored by original labels.
    n_components_gmm : int, optional
        Number of clusters to fit for GMM if `cluster_method='GMM'`. Default is 5.
    min_cluster_size_hdbscan : int, optional
        Minimum cluster size for HDBSCAN. Only used if `cluster_method='HDBSCAN'`.
    min_samples_hdbscan : int, optional
        Minimum samples for HDBSCAN. Only used if `cluster_method='HDBSCAN'`.
    axis_label_fontsize : int, optional
        Font size for axis labels.
    tick_fontsize : int, optional
        Font size for axis tick labels.
    title_fontsize : int, optional
        Font size for row titles.
    legend_fontsize : int, optional
        Font size for legend and text annotations.

    Returns
    -------
    None
        Displays matplotlib figures with UMAP projections and clustering overlays if enabled.
    """
    # Extrct the atropy table from the fits path
    tbl = Table.read(table_path, format='fits')

    # Extract the values from the table depending on the data keys
    X = tbl[data_keys].to_pandas().values

    # Apply standard scaling each of the columns dimensions
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Extract the labels assigned during high dimensional GMM - XD Clustering to help visualize the clusters
    labels = tbl[label_column]
    named_labels = np.array([labels_name[int(i)] for i in labels])
    unique_names = sorted(set(named_labels))

    # Assign the number of graph rows based wether a clustering method is used or not
    n_cols = len(n_neighbors_list)
    n_rows = 2 if cluster_method in ['GMM', 'HDBSCAN'] else 1

    # Plottig the UMAP results
    # Create a grid of subplots
    fig = plt.figure(figsize=(6 * n_cols, 5 * n_rows), constrained_layout=True)
    gs = gridspec.GridSpec(n_rows, n_cols, figure=fig, hspace=0.075)
    axes = np.empty((n_rows, n_cols), dtype=object)

    for idx, (n_n, m_d) in enumerate(zip(n_neighbors_list, min_dist_list)):
        # For all UMAP configurations, apply the UMAP data reduction, obtaining the 2D coordinates
        reducer = umap.UMAP(n_components=2, n_neighbors=n_n, min_dist=m_d, random_state=42)
        X_umap = reducer.fit_transform(X_scaled)

        # Plot the UMAP results colour coded by the GMM labels from the XD clustering - no analysis has been done on the UMAP results at this stage
        ax_top = fig.add_subplot(gs[0, idx])
        axes[0, idx] = ax_top
        for name in unique_names:
            mask = named_labels == name
            ax_top.scatter(X_umap[mask, 0], X_umap[mask, 1],
                           label=name, color=labels_color_map[name], s=10, alpha=0.7)
        ax_top.set_xlabel("UMAP-1", fontsize=axis_label_fontsize)
        ax_top.set_ylabel("UMAP-2", fontsize=axis_label_fontsize)
        ax_top.tick_params(axis='both', labelsize=tick_fontsize)

        # Add UMAP params to top-right of each plot
        ax_top.text(0.99, 0.98, f"n_neighbors={n_n}\nmin_dist={m_d}",
                    transform=ax_top.transAxes, ha='right', va='top',
                    fontsize=legend_fontsize,
                    bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.7))

        # If a clustering method is selected, we apply clustering to the UMAP coordinates and compare the results visually to see the plausibility of the clusters results
        # GMM Clustering
        if cluster_method == 'GMM':
            gmm = GaussianMixture(n_components=n_components_gmm, random_state=9)
            cluster_labels = gmm.fit_predict(X_umap)
        
        # HDBSCAN Clustering
        elif cluster_method == 'HDBSCAN':
            hdb = HDBSCAN(min_cluster_size=min_cluster_size_hdbscan, min_samples=min_samples_hdbscan)
            cluster_labels = hdb.fit_predict(X_umap)
        else:
            cluster_labels = None

        # Plot the clustering results - the color/ labeling of these is `random` each time so no labels are used or assigned - expecially with APOGEE where we obtain poor results
        if cluster_labels is not None:
            ax_bot = fig.add_subplot(gs[1, idx])
            axes[1, idx] = ax_bot
            ax_bot.scatter(X_umap[:, 0], X_umap[:, 1], c=cluster_labels, cmap='tab10', s=10, alpha=0.7)
            ax_bot.set_xlabel("UMAP-1", fontsize=axis_label_fontsize)
            ax_bot.set_ylabel("UMAP-2", fontsize=axis_label_fontsize)
            ax_bot.tick_params(axis='both', labelsize=tick_fontsize)

    # Add titles to the plots
    if cluster_method:
        fig.text(0.5, 0.52, f"{cluster_method} Clustering Labels", ha='center', va='top', fontsize=title_fontsize)
    fig.text(0.5, 1.0, "High Dimensional XD Labels", ha='center', va='bottom', fontsize=title_fontsize)

    # Add XD labels to the top-left the UMAP plot
    handles, labels_ = axes[0, 0].get_legend_handles_labels()
    axes[0, 0].legend(handles, labels_, title='True Label', loc='upper left', fontsize=legend_fontsize)

    # if cluster_method == 'HDBSCAN' add clustering hyperparameters to the top-left of each plot
    if cluster_method == 'HDBSCAN':
        for ax in axes[1, :]:
            ax.text(0.01, 0.98,
                    f"min_cluster_size={min_cluster_size_hdbscan}\nmin_samples={min_samples_hdbscan}",
                    transform=ax.transAxes, ha='left', va='top',
                    fontsize=legend_fontsize,
                    bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.7))

    plt.show()



[docs]def investigate_tsne(
    table_path,
    data_keys,
    perplexities,
    learning_rates,
    label_column='max_gauss',
    labels_name=None,
    labels_color_map=None,
    axis_label_fontsize=14,
    tick_fontsize=12,
    legend_fontsize=10,
    title_fontsize=14
):
    """
    Visualizes t-SNE dimensionality reduction results across multiple configurations.

    Parameters
    ----------
    table_path : str
        Path to the FITS file containing the data table.
    data_keys : list of str
        Column names to use as input features for dimensionality reduction.
    perplexities : list of int
        List of perplexity values for each t-SNE configuration.
    learning_rates : list of float
        List of learning rate values for each t-SNE configuration.
    label_column : str, optional
        Column name representing true GMM cluster labels. Default is 'max_gauss'.
    labels_name : dict, optional
        Mapping from numeric GMM component indices to descriptive cluster names.
    labels_color_map : dict, optional
        Mapping from descriptive cluster names to color codes.
    axis_label_fontsize : int, optional
        Font size for axis labels.
    tick_fontsize : int, optional
        Font size for axis ticks.
    legend_fontsize : int, optional
        Font size for the legend.
    title_fontsize : int, optional
        Font size for plot titles.

    Returns
    -------
    None
    """
    assert len(perplexities) == len(learning_rates), "perplexities and learning_rates must be same length"

    # Load the data from the FITS file 
    tbl = Table.read(table_path, format='fits')

    # Extract the values from the table depending on the data keys
    X = tbl[data_keys].to_pandas().values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Extract the labels assigned during high dimensional GMM - XD Clustering to help visualize the clusters
    labels = tbl[label_column]

    # cross reference the labels with the names and colors provided
    named_labels = np.array([labels_name[int(i)] for i in labels])
    unique_names = sorted(set(named_labels))

    # Plotting the t-SNE results
    fig, axes = plt.subplots(1, len(perplexities), figsize=(6 * len(perplexities), 5))
    if len(perplexities) == 1:
        axes = [axes]

    # Create a grid of subplots
    for idx, (perp, lr) in enumerate(zip(perplexities, learning_rates)):
        # For all t-SNE configurations, apply the t-SNE data reduction, obtaining the 2D coordinates
        tsne = TSNE(n_components=2, perplexity=perp, learning_rate=lr, random_state=42, init='pca', n_iter=1000)
        X_tsne = tsne.fit_transform(X_scaled)

        # Plot the t-SNE results colour coded by the GMM labels from the XD clustering - no analysis is done on the t-SNE results
        ax = axes[idx]
        for name in unique_names:
            mask = named_labels == name
            ax.scatter(
                X_tsne[mask, 0], X_tsne[mask, 1],
                label=name,
                color=labels_color_map[name],
                s=10
            )

        # Set the titles and labels
        ax.set_xlabel("t-SNE-1", fontsize=axis_label_fontsize)
        ax.set_ylabel("t-SNE-2", fontsize=axis_label_fontsize)
        ax.tick_params(axis='both', which='major', labelsize=tick_fontsize)

        # Add configuration parameters to the top-left of each plot
        config_text = f"Perplexity={perp}\nLR={lr}"
        ax.text(0.02, 0.98, config_text, transform=ax.transAxes,
                ha='left', va='top', fontsize=tick_fontsize,
                bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.6))

    # Shared legend in top-right of the last plot
    handles, labels_ = axes[-1].get_legend_handles_labels()
    axes[-1].legend(handles, labels_, title='GMM Component', loc='upper right', fontsize=legend_fontsize)

    plt.tight_layout()
    plt.show()