"""Differential expression testing for non-count data.
This module provides DE analysis for log-normalized, scaled, or binary data
using logistic regression, ANOVA, or binomial models.
For count data, use :func:`delnx.tl.nb_fit` + :func:`delnx.tl.nb_test`.
For fast cluster markers, use :func:`delnx.tl.rank_de`.
"""
import numpy as np
import pandas as pd
import statsmodels.api as sm
from anndata import AnnData
from delnx._logging import logger
from delnx._utils import _get_layer
from ._de_tests import _run_de
from ._design import build_design, resolve_contrast
from ._jax_tests import _run_batched_de
from ._utils import _check_method_and_data_type, _infer_data_type
[docs]
def de(
adata: AnnData,
condition_key: str | None = None,
formula: str | None = None,
contrast: str | int | None = None,
reference: str | None = None,
covariate_keys: list[str] | None = None,
method: str = "lr",
layer: str | None = None,
multitest_method: str = "fdr_bh",
batch_size: int = 2048,
maxiter: int = 100,
verbose: bool = True,
) -> pd.DataFrame:
"""Differential expression testing for non-count data.
General-purpose DE function for log-normalized, scaled, or binary data.
Builds a design matrix from ``formula`` or ``condition_key`` and tests
a single ``contrast``. For multiple comparisons, call this function
once per contrast or use :func:`grouped` for per-group analysis.
For count data, use :func:`nb_fit` + :func:`nb_test` instead.
For fast cluster markers, use :func:`rank_de`.
Parameters
----------
adata : AnnData
Annotated data object containing expression data and metadata.
condition_key : str | None, default=None
Column name in ``adata.obs`` containing condition labels.
Internally builds ``"~ condition_key"`` for the design matrix.
Mutually exclusive with ``formula``.
formula : str | None, default=None
R-style formula for the design matrix (e.g., ``"~ treatment + batch"``).
Parsed by patsy. Mutually exclusive with ``condition_key``.
contrast : str | int | None, default=None
Coefficient to test. Supports shorthand:
- Level name: ``"drugA"`` (resolved via ``condition_key``).
- Bracket shorthand: ``"treatment[drugA]"`` (for multi-covariate formulas).
- Full patsy name: ``"treatment[T.drugA]"`` (always works).
- Integer index or None (last coefficient).
reference : str | None, default=None
Reference level for categorical conditions. This level becomes
the intercept in treatment coding.
covariate_keys : list[str] | None, default=None
Columns in ``adata.obs`` to include as covariates.
Only used with ``condition_key`` (include covariates in
``formula`` directly).
method : str, default="lr"
Statistical method for testing:
- ``"lr"``: Logistic regression with likelihood ratio test.
Recommended for log-normalized single-cell data.
- ``"anova"``: ANOVA based on linear model.
Recommended for log-normalized or scaled data.
- ``"anova_residual"``: Linear model with residual F-test.
- ``"binomial"``: Binomial GLM likelihood ratio test.
For binary data (e.g., ATAC-seq).
layer : str | None, default=None
Layer in ``adata.layers`` containing expression data.
If None, uses ``adata.X``.
multitest_method : str, default="fdr_bh"
Method for multiple testing correction
(see :func:`statsmodels.stats.multipletests`).
batch_size : int, default=2048
Number of features to process per batch.
maxiter : int, default=100
Maximum number of optimization iterations.
verbose : bool, default=True
Whether to print progress messages.
Returns
-------
pd.DataFrame
Results with columns:
- ``feature``: Gene/feature names
- ``log2fc``: Log2 fold change (coefficient / log(2), clipped to [-10, 10])
- ``coef``: Model coefficient (log scale)
- ``stat``: Test statistic (LR chi-squared or F-statistic)
- ``pval``: Raw p-value
- ``padj``: Adjusted p-value
Examples
--------
Simple condition comparison:
>>> results = dx.tl.de(adata, condition_key="treatment", reference="control",
... contrast="treatment[T.drugA]")
Formula-based with covariates:
>>> results = dx.tl.de(adata, formula="~ treatment + batch",
... contrast="treatment[T.drugA]")
Continuous covariate:
>>> results = dx.tl.de(adata, formula="~ age + sex", contrast="age")
Binomial for binary ATAC data:
>>> results = dx.tl.de(adata, condition_key="treatment", reference="control",
... contrast="treatment[T.drugA]", method="binomial",
... layer="binary")
Notes
-----
For count data (RNA-seq), use :func:`nb_fit` + :func:`nb_test` instead.
For fast cluster markers, use :func:`rank_de`.
"""
# Validate mutual exclusivity
if formula is not None and condition_key is not None:
raise ValueError("Specify either 'formula' or 'condition_key', not both.")
if formula is None and condition_key is None:
raise ValueError("One of 'formula' or 'condition_key' must be specified.")
# Validate supported methods
supported_methods = ("lr", "anova", "anova_residual", "binomial")
if method not in supported_methods:
raise ValueError(
f"Unsupported method '{method}'. Use one of {supported_methods}. "
f"For count data, use nb_fit() + nb_test() instead."
)
# Get expression matrix
X = _get_layer(adata, layer)
# Infer data type
data_type = _infer_data_type(X)
logger.info(f"Inferred data type: {data_type}", verbose=verbose)
# Validate method and data type combinations
_check_method_and_data_type(method, data_type)
# Build design matrix (both paths go through build_design)
design_matrix, column_names = build_design(
adata.obs,
formula=formula,
condition_key=condition_key,
reference=reference,
covariate_keys=covariate_keys,
)
# Resolve contrast to column index
test_idx = resolve_contrast(contrast, column_names, condition_key=condition_key)
# Filter to non-zero features
feature_mask = np.array(X.sum(axis=0) > 0).flatten()
feature_names = adata.var_names[feature_mask].values
X_filtered = X[:, feature_mask]
logger.info(f"Running DE for {np.sum(feature_mask)} features", verbose=verbose)
# Dispatch to backend
if method == "binomial":
results = _run_de(
X=X_filtered,
model_data=pd.DataFrame(), # unused with design_matrix
feature_names=feature_names,
method=method,
backend="statsmodels",
condition_key="", # unused with design_matrix
design_matrix=design_matrix,
test_idx=test_idx,
verbose=verbose,
)
else:
results = _run_batched_de(
X=X_filtered,
model_data=pd.DataFrame(), # unused with design_matrix
feature_names=feature_names,
method=method,
condition_key="", # unused with design_matrix
design_matrix=design_matrix,
test_idx=test_idx,
batch_size=batch_size,
maxiter=maxiter,
verbose=verbose,
)
results["feature"] = results["feature"].astype(str)
# Check results
if len(results) == 0 or results["pval"].isna().all():
raise ValueError(
"Differential expression analysis failed. "
"Check input data or set verbose=True for details."
)
# Compute log2fc from coefficient (coef is on log scale for LR/binomial)
results["log2fc"] = np.clip(results["coef"] / np.log(2), -10.0, 10.0)
# Clip p-values
results["pval"] = np.clip(results["pval"], 1e-50, 1)
# Multiple testing correction
valid = results["pval"].notna()
results["padj"] = np.nan
if valid.sum() > 0:
results.loc[valid, "padj"] = sm.stats.multipletests(
results.loc[valid, "pval"].values, method=multitest_method
)[1]
results = results.sort_values(by=["padj", "coef"]).reset_index(drop=True)
return results[["feature", "log2fc", "coef", "stat", "pval", "padj"]].copy()