Source code for delnx.pp._size_factors
import numpy as np
from scipy import sparse
from delnx._logging import logger
from delnx._utils import _get_layer
[docs]
def size_factors(adata, method="normed_sum", layer=None, obs_key_added="size_factors"):
"""Compute size factors for (single-cell) RNA-seq normalization.
This function calculates sample/cell-specific normalization factors (size factors)
to account for differences in sequencing depth and technical biases between samples.
The computed size factors can be used to normalize counts for visualization or
as offset terms in statistical models for differential expression analysis.
Parameters
----------
adata : AnnData
Annotated data matrix containing expression data.
method : str, default="normed_sum"
Method to compute size factors:
- "normed_sum": Library size normalization based on the total counts per
sample. Simple and efficient, works with sparse matrices.
- "ratio": DESeq2-style median-of-ratios size factors, robust to differential
expression between samples. Requires dense matrices.
- "poscounts": Positive counts method with geometric mean normalization.
Requires dense matrices.
layer : str, optional
Layer in `adata.layers` to use for size factor calculation. If None, uses
`adata.X`. Should contain raw (unlogged) counts.
obs_key_added : str, default="size_factors"
Key in `adata.obs` where the computed size factors will be stored.
Returns
-------
None
Updates `adata` in place and sets the following field:
- `adata.obs[obs_key_added]`: Size factors for each cell.
Examples
--------
Calculate library size normalization (default):
>>> import scanpy as sc
>>> import delnx as dx
>>> adata = sc.read_h5ad("counts.h5ad")
>>> dx.pp.size_factors(adata, method="normed_sum")
Calculate DESeq2-style median-of-ratios size factors:
>>> # Requires dense matrix
>>> if sparse.issparse(adata.X):
... adata.X = adata.X.toarray()
>>> dx.pp.size_factors(adata, method="ratio", obs_key_added="ratio_factors")
Use size factors for normalization in differential expression analysis:
>>> # Compute DE with size factors as offset
>>> results = dx.tl.de(adata, condition_key="treatment", size_factor_key="size_factors")
Notes
-----
- Size factors are scaled to have a geometric mean of 1.0 across all samples
- Methods "ratio" and "poscounts" require dense matrices; use "normed_sum" for sparse data
- A warning will be raised if size factors cannot be computed for some cells
- Zero or invalid size factors are replaced with a small value (0.001)
"""
# Get expression matrix
X = _get_layer(adata, layer)
if sparse.issparse(X) and method in ["ratio", "poscounts"]:
raise ValueError(
f"The '{method}' method requires a dense matrix. Please convert the sparse matrix to dense format before using this method or use the 'normed_sum' method."
)
if method == "ratio":
log_X = np.log(X)
log_geo_means = np.mean(log_X, 0)
filtered_genes = ~np.isinf(log_geo_means)
# Check if we have any genes left after filtering
if not filtered_genes.any():
raise ValueError(
f"All genes have a least one zero count. Cannot compute size factors with the '{method}' method."
)
log_ratios = log_X[:, filtered_genes] - log_geo_means[filtered_genes]
size_factors = np.exp(np.median(log_ratios, axis=1))
elif method == "normed_sum":
if sparse.issparse(X):
size_factors = np.asarray(X.sum(axis=1)).flatten().astype(float)
else:
size_factors = X.sum(axis=1).astype(float)
elif method == "poscounts":
X = X.astype(float)
log_geometric_means = np.mean(np.log(X + 0.5), axis=1)
X[X == 0] = np.nan
size_factors = np.exp(np.nanmedian(np.log(X) / log_geometric_means.reshape(-1, 1), axis=1))
else:
raise ValueError(f"Unsupported method: {method}")
zero_col = size_factors <= 0 | np.isnan(size_factors)
if zero_col.any() and method == "ratio":
logger.warning(
"Too many zeros to compute size factors with the 'ratio' method. Please use the 'normed_sum' method instead."
)
if zero_col.any():
logger.warning("Some size factors could not be computed due to too many zero values. Setting to 0.001")
size_factors[zero_col] = np.nan
size_factors /= np.exp(np.nanmean(np.log(size_factors)))
size_factors[zero_col] = 0.001
else:
size_factors /= np.exp(np.mean(np.log(size_factors)))
# Store size factors in adata.obs
adata.obs[obs_key_added] = size_factors