Source code for delnx.tl._design

"""Design matrix construction from formulas or condition keys.

Shared helper used by both :func:`nb_fit` and :func:`de` to build design
matrices with consistent encoding.
"""

import numpy as np
import pandas as pd
import patsy



[docs]
def build_design(
    obs: pd.DataFrame,
    formula: str | None = None,
    condition_key: str | None = None,
    reference: str | None = None,
    covariate_keys: list[str] | None = None,
) -> tuple[np.ndarray, list[str]]:
    """Build a design matrix from a formula or condition_key.

    Parameters
    ----------
    obs : pd.DataFrame
        Observation metadata (typically ``adata.obs``).
    formula : str | None
        R-style formula (e.g., ``"~ treatment + batch"``).
        Mutually exclusive with ``condition_key``.
    condition_key : str | None
        Column in ``obs`` for condition labels. Builds a formula internally.
        Mutually exclusive with ``formula``.
    reference : str | None
        Reference level for the condition. Used to reorder categorical levels
        so the reference becomes the intercept. Only used with ``condition_key``
        or when ``formula`` contains a simple term matching a categorical column.
    covariate_keys : list[str] | None
        Additional columns to include as covariates. Only used with
        ``condition_key`` (ignored when ``formula`` is provided, since
        covariates should be part of the formula).

    Returns
    -------
    tuple[np.ndarray, list[str]]
        Design matrix of shape ``(n_obs, n_coef)`` and list of column names.

    Raises
    ------
    ValueError
        If both ``formula`` and ``condition_key`` are specified, or neither is.
    """
    if formula is not None and condition_key is not None:
        raise ValueError("Specify either 'formula' or 'condition_key', not both.")
    if formula is None and condition_key is None:
        raise ValueError("One of 'formula' or 'condition_key' must be specified.")

    if condition_key is not None:
        # Build formula from condition_key + covariates
        if condition_key not in obs.columns:
            raise ValueError(f"Condition key '{condition_key}' not found in obs")
        formula = f"~ {condition_key}"
        if covariate_keys:
            for cov in covariate_keys:
                if cov not in obs.columns:
                    raise ValueError(f"Covariate '{cov}' not found in obs")
            formula += " + " + " + ".join(covariate_keys)

    # Prepare obs copy with reference level handling
    obs_copy = obs.copy()
    if reference is not None and condition_key is not None:
        col = obs_copy[condition_key]
        unique_vals = col.unique() if not hasattr(col, "cat") else col.cat.categories.tolist()
        if reference not in list(unique_vals):
            raise ValueError(
                f"Reference '{reference}' not found in '{condition_key}'. "
                f"Available levels: {list(unique_vals)}"
            )
        # Reorder categories so reference is first (becomes intercept in Treatment coding)
        ordered = [reference] + [v for v in unique_vals if v != reference]
        obs_copy[condition_key] = pd.Categorical(col, categories=ordered)

    # Build design matrix via patsy
    dm = patsy.dmatrix(formula, obs_copy, return_type="dataframe")
    design_matrix = np.asarray(dm, dtype=np.float64)
    column_names = list(dm.columns)

    return design_matrix, column_names



import re

_BRACKET_RE = re.compile(r"^(.+)\[(.+)\]$")


def resolve_contrast(
    contrast: str | int | None,
    column_names: list[str],
    condition_key: str | None = None,
) -> int:
    """Resolve a contrast specification to a column index.

    Supports several shorthand forms so users don't need to write full
    patsy treatment-coding names:

    1. Full patsy name: ``"treatment[T.drugA]"`` — exact match.
    2. Bracket shorthand: ``"treatment[drugA]"`` — resolves to
       ``"treatment[T.drugA]"``.
    3. Bare level with ``condition_key``: ``"drugA"`` — resolves to
       ``"treatment[T.drugA]"`` when ``condition_key="treatment"``.
    4. Bare level scan: ``"drugA"`` — if exactly one column ends with
       ``[T.drugA]``, use it.

    Parameters
    ----------
    contrast : str | int | None
        Contrast to resolve. ``None`` uses the last coefficient.
    column_names : list[str]
        Design matrix column names (from :func:`build_design`).
    condition_key : str | None
        Condition key, used for bare-level resolution.

    Returns
    -------
    int
        Column index into the design matrix.
    """
    n_coef = len(column_names)
    if contrast is None:
        return n_coef - 1
    if isinstance(contrast, int):
        if contrast < 0 or contrast >= n_coef:
            raise IndexError(f"Contrast index {contrast} out of range for {n_coef} coefficients.")
        return contrast
    if not isinstance(contrast, str):
        raise NotImplementedError("Custom contrast vectors are not yet supported.")

    # 1. Exact match
    if contrast in column_names:
        return column_names.index(contrast)

    # 2. Bracket shorthand: "key[level]" → "key[T.level]"
    #    Also handles interaction terms: "a[x]:b[y]" → "a[T.x]:b[T.y]"
    if "[" in contrast:
        segments = contrast.split(":")
        resolved = []
        for seg in segments:
            m = _BRACKET_RE.match(seg)
            if m and not m.group(2).startswith("T."):
                resolved.append(f"{m.group(1)}[T.{m.group(2)}]")
            else:
                resolved.append(seg)
        candidate = ":".join(resolved)
        if candidate in column_names:
            return column_names.index(candidate)

    # 3. Bare level + condition_key: "level" → "condition_key[T.level]"
    if condition_key is not None:
        candidate = f"{condition_key}[T.{contrast}]"
        if candidate in column_names:
            return column_names.index(candidate)

    # 4. Bare level suffix scan: find unique "*[T.level]"
    suffix = f"[T.{contrast}]"
    matches = [name for name in column_names if name.endswith(suffix)]
    if len(matches) == 1:
        return column_names.index(matches[0])
    if len(matches) > 1:
        raise ValueError(
            f"Contrast '{contrast}' is ambiguous — matches: {matches}. "
            f"Use bracket syntax to disambiguate, e.g. '{matches[0].split('[')[0]}[{contrast}]'."
        )

    # 5. Nothing matched
    raise ValueError(
        f"Contrast '{contrast}' not found in design columns: {column_names}. "
        f"Use one of the column names directly, or a level value from the condition variable."
    )


def parse_contrast_vector(
    contrast: str | list | np.ndarray | None,
    column_names: list[str],
    condition_key: str | None = None,
) -> np.ndarray | None:
    """Parse a contrast into a numeric vector, or return None for single-column contrasts.

    Handles:

    - List/array: ``[0, 1, -1]`` — validated and returned directly.
    - String formula: ``"drugA - drugB"`` — parsed into a numeric vector
      by resolving each term to a column index.
    - Single-column strings/int/None: returns ``None`` (caller should use
      :func:`resolve_contrast` instead).

    Parameters
    ----------
    contrast
        Contrast specification.
    column_names : list[str]
        Design matrix column names.
    condition_key : str | None
        Condition key for resolving bare level names.

    Returns
    -------
    np.ndarray | None
        Numeric contrast vector of length ``len(column_names)``, or None
        if the contrast is a single-column specification.
    """
    n_coef = len(column_names)

    # List or array → validate and return
    if isinstance(contrast, (list, np.ndarray)):
        vec = np.asarray(contrast, dtype=np.float64)
        if vec.ndim != 1 or len(vec) != n_coef:
            raise ValueError(
                f"Contrast vector length {len(vec)} doesn't match {n_coef} coefficients. "
                f"Design columns: {column_names}"
            )
        return vec

    if not isinstance(contrast, str):
        return None

    # Check if this is a formula (contains + or - outside brackets)
    # Simple heuristic: strip all bracket content, check for operators
    stripped = re.sub(r"\[[^\]]*\]", "", contrast)
    if "+" not in stripped and "-" not in stripped:
        return None

    # Parse formula: split on + and - while preserving the operator
    vec = np.zeros(n_coef, dtype=np.float64)
    # Tokenize: split into (sign, term) pairs
    # Prepend '+' if formula doesn't start with '-'
    formula = contrast.strip()
    if not formula.startswith("-"):
        formula = "+" + formula

    tokens = re.findall(r"([+-])\s*([^+-]+)", formula)
    if not tokens:
        return None

    for sign_str, term in tokens:
        term = term.strip()
        if not term:
            continue
        sign = 1.0 if sign_str == "+" else -1.0

        # Check for coefficient: "2*drugA" or "0.5 * drugA"
        coef_match = re.match(r"^([\d.]+)\s*\*\s*(.+)$", term)
        if coef_match:
            coef_val = float(coef_match.group(1))
            term = coef_match.group(2).strip()
        else:
            coef_val = 1.0

        idx = resolve_contrast(term, column_names, condition_key=condition_key)
        vec[idx] += sign * coef_val

    return vec