Step 3 — Simulate Counts¶
This notebook explains how transcript counts are sampled from the gene parameter table.
The count model¶
For each (cell, gene) pair counts are drawn from a Negative-Binomial (NB) distribution via two-stage (Gamma–Poisson) sampling:
λ ~ Gamma(shape = θ, scale = μ · cell_scale / θ)
k ~ Poisson(λ)
where:
- μ is the gene's mean expression in this cell type (from
gpar_df) - θ is the NB dispersion (higher θ → lower overdispersion)
- cell_scale is a per-cell size factor from nucleus area
The result is a (n_cells × n_genes) integer count matrix stored in sim.count_array.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from STpuppeteer.simulation import SimulationConfig, SpotlessSimulator
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
CT_PALETTE = {"ct_0": "#49997c", "ct_1": "#1ebecd", "ct_2": "#ae3918"}
GENE_PALETTE = {"ct_0 marker": "#49997c", "ct_1 marker": "#1ebecd",
"ct_2 marker": "#ae3918", "housekeeping": "#aaaaaa"}
plt.rcParams.update({"figure.dpi": 110, "axes.spines.top": False, "axes.spines.right": False})
def gene_class(gt):
for k in ["ct_0", "ct_1", "ct_2"]:
if f"{k}_mkr" in gt:
return f"{k} marker"
return "housekeeping"
print("Imports OK")
Imports OK
3.1 Run gene parameters + cell layout + counts¶
config = SimulationConfig(
seed=42, canvas_size=200.0,
n_cells=250, n_celltype=3,
celltype_proportion=[0.5, 0.3, 0.2],
continuity=0.25,
n_genes=120, n_markers=25,
marker_mu=0.75, marker_cv=0.6,
silence_mu=0.05, theta_alpha=2.0,
)
sim = SpotlessSimulator(config)
sim.generate_gene_parameters()
sim.initialize_cells()
sim.simulate_counts()
counts = sim.count_array # (n_cells, n_genes)
cell_gdf = sim.cell_gdf
gpar_df = sim.gpar_df
print(f"Count matrix shape: {counts.shape}")
print(f"Total transcripts : {counts.sum():,}")
print(f"Mean per cell : {counts.sum(axis=1).mean():.1f}")
print(f"Sparsity : {(counts == 0).mean():.1%}")
Count matrix shape: (250, 120) Total transcripts : 7,551 Mean per cell : 30.2 Sparsity : 83.5%
3.2 Count matrix overview¶
- Heatmap (log1p) — cells sorted by cell type reveal marker gene blocks
- Counts per cell — shows cell-to-cell variability
- Counts per gene — long tail is typical of scRNA-seq-like data
sort_idx = cell_gdf.sort_values("celltype").index.tolist()
gpar_df["gene_class"] = gpar_df["gene_type"].apply(gene_class)
fig, axs = plt.subplots(1, 3, figsize=(15, 4))
# Heatmap
sns.heatmap(np.log1p(counts[sort_idx, :]), ax=axs[0], cmap="magma",
xticklabels=False, yticklabels=False,
cbar_kws={"label": "log1p(count)"})
axs[0].set_title("Count matrix (log1p, cells sorted by type)")
axs[0].set_xlabel("Genes"); axs[0].set_ylabel("Cells")
# Counts per cell
tc_df = pd.DataFrame({"total": counts.sum(axis=1), "celltype": cell_gdf["celltype"].values})
sns.boxplot(data=tc_df, x="celltype", y="total",
palette=CT_PALETTE, width=0.5, ax=axs[1],
flierprops={"marker": ".", "markersize": 3})
axs[1].set_title("Total counts per cell")
axs[1].set_xlabel("")
# Counts per gene
axs[2].hist(counts.sum(axis=0), bins=35, color="#555599", alpha=0.8)
axs[2].axvline(counts.sum(axis=0).mean(), color="crimson", ls="--", lw=1.5, label="mean")
axs[2].set_title("Total counts per gene")
axs[2].set_xlabel("Transcript count")
axs[2].legend()
fig.suptitle("Count matrix overview", fontsize=12)
fig.tight_layout()
plt.show()
3.3 Gene class count distributions¶
Marker genes express highly in their own cell type and are nearly silent in others.
Housekeeping genes show moderate, consistent expression across all cell types.
# Extract count per gene separately for ct_0 cells and all cells
ct0_mask = cell_gdf["celltype"].values == "ct_0"
ct0_counts = counts[ct0_mask, :] # counts from ct_0 cells only
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
for cls, color in GENE_PALETTE.items():
mask = gpar_df["gene_class"] == cls
if mask.sum() == 0:
continue
gene_idx = gpar_df.index[mask].tolist()
# Mean counts across ct_0 cells (all genes of this class)
means_ct0 = ct0_counts[:, gene_idx].mean(axis=0)
axs[0].hist(means_ct0, bins=20, color=color, alpha=0.65, density=True, label=cls)
# Mean counts across all cells
means_all = counts[:, gene_idx].mean(axis=0)
axs[1].hist(means_all, bins=20, color=color, alpha=0.65, density=True, label=cls)
axs[0].set_title("Mean counts per gene in ct_0 cells")
axs[0].set_xlabel("Mean count"); axs[0].set_ylabel("Density")
axs[0].legend(fontsize=8, title="Gene class")
axs[1].set_title("Mean counts per gene across all cells")
axs[1].set_xlabel("Mean count")
axs[1].legend(fontsize=8, title="Gene class")
fig.suptitle("Count distributions by gene class", fontsize=12)
fig.tight_layout()
plt.show()
3.4 Effect of marker_mu on count distribution¶
Increasing marker_mu raises the mean of the marker gene prior, directly lifting total counts per cell.
mu_values = [0.2, 0.5, 1.0, 2.5]
fig, axs = plt.subplots(1, len(mu_values), figsize=(14, 3.5), sharey=False)
for ax, mu in zip(axs, mu_values):
cfg = SimulationConfig(
seed=1, n_cells=200, n_celltype=3, n_genes=60, n_markers=20,
marker_mu=mu, marker_cv=0.5, silence_mu=0.02, continuity=0.2,
)
s = SpotlessSimulator(cfg)
s.generate_gene_parameters()
s.initialize_cells()
s.simulate_counts()
totals = s.count_array.sum(axis=1)
ct_labels = s.cell_gdf["celltype"].values
for ct, color in CT_PALETTE.items():
mask = ct_labels == ct
if mask.any():
ax.hist(totals[mask], bins=20, color=color, alpha=0.6, label=ct)
ax.set_title(f"marker_mu = {mu}")
ax.set_xlabel("Total counts/cell")
axs[0].set_ylabel("Cells")
axs[-1].legend(fontsize=8, title="Cell type")
fig.suptitle("marker_mu — higher = more transcripts per cell", fontsize=12, y=1.02)
fig.tight_layout()
plt.show()
/work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers. warnings.warn( /work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers. warnings.warn( /work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers. warnings.warn( /work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers. warnings.warn(
3.5 Effect of theta_alpha on overdispersion¶
theta_alpha controls the Gamma prior on the NB dispersion θ.
Lower theta_alpha → prior mass at small θ → more overdispersed (wider) count distributions.
theta_alphas = [0.5, 1.0, 5.0, 20.0]
fig, axs = plt.subplots(1, len(theta_alphas), figsize=(14, 3.5), sharey=False)
for ax, ta in zip(axs, theta_alphas):
cfg = SimulationConfig(
seed=2, n_cells=150, n_celltype=3, n_genes=60, n_markers=20,
marker_mu=1.0, theta_alpha=ta, theta_rate=1.0, continuity=0.2,
)
s = SpotlessSimulator(cfg)
s.generate_gene_parameters()
s.initialize_cells()
s.simulate_counts()
# Log(variance/mean) per gene — index of dispersion
gene_mean = s.count_array.mean(axis=0)
gene_var = s.count_array.var(axis=0)
disp_idx = np.where(gene_mean > 0, gene_var / gene_mean, 0)
ax.hist(np.log1p(disp_idx), bins=25, color="#dd7733", alpha=0.85, edgecolor="white")
ax.set_title(f"theta_alpha = {ta}\nmedian Var/Mean = {np.median(disp_idx):.2f}")
ax.set_xlabel("log(1 + Var/Mean) per gene")
axs[0].set_ylabel("Genes")
fig.suptitle("theta_alpha — lower = more overdispersion (higher Var/Mean)", fontsize=12, y=1.02)
fig.tight_layout()
plt.show()
/work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers. warnings.warn( /tmp/ipykernel_3686117/2924577456.py:17: RuntimeWarning: invalid value encountered in divide disp_idx = np.where(gene_mean > 0, gene_var / gene_mean, 0) /work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers. warnings.warn( /work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers. warnings.warn( /work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers. warnings.warn(
Summary¶
| Parameter | What it controls | Typical range |
|---|---|---|
marker_mu |
Signal level — mean marker gene expression | 0.3 – 2.0 |
marker_cv |
Gene-to-gene variability within the marker class | 0.3 – 1.0 |
theta_alpha |
Overdispersion — lower = noisier counts | 0.5 – 5.0 |
n_markers |
How many cell-type-specific genes per type | 10 – 100 |
Next: 04_simulate_transcripts.ipynb — how counts become spatially-placed transcripts with leakage.