Step 3 — Simulate Counts¶

This notebook explains how transcript counts are sampled from the gene parameter table.

The count model¶

For each (cell, gene) pair counts are drawn from a Negative-Binomial (NB) distribution via two-stage (Gamma–Poisson) sampling:

λ  ~  Gamma(shape = θ,  scale = μ · cell_scale / θ)
k  ~  Poisson(λ)

where:

μ is the gene's mean expression in this cell type (from gpar_df)
θ is the NB dispersion (higher θ → lower overdispersion)
cell_scale is a per-cell size factor from nucleus area

The result is a (n_cells × n_genes) integer count matrix stored in sim.count_array.

In [1]:

Copied!





import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from STpuppeteer.simulation import SimulationConfig, SpotlessSimulator

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

CT_PALETTE = {"ct_0": "#49997c", "ct_1": "#1ebecd", "ct_2": "#ae3918"}
GENE_PALETTE = {"ct_0 marker": "#49997c", "ct_1 marker": "#1ebecd",
                "ct_2 marker": "#ae3918", "housekeeping": "#aaaaaa"}
plt.rcParams.update({"figure.dpi": 110, "axes.spines.top": False, "axes.spines.right": False})


def gene_class(gt):
    for k in ["ct_0", "ct_1", "ct_2"]:
        if f"{k}_mkr" in gt:
            return f"{k} marker"
    return "housekeeping"

print("Imports OK")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from STpuppeteer.simulation import SimulationConfig, SpotlessSimulator

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

CT_PALETTE = {"ct_0": "#49997c", "ct_1": "#1ebecd", "ct_2": "#ae3918"}
GENE_PALETTE = {"ct_0 marker": "#49997c", "ct_1 marker": "#1ebecd",
                "ct_2 marker": "#ae3918", "housekeeping": "#aaaaaa"}
plt.rcParams.update({"figure.dpi": 110, "axes.spines.top": False, "axes.spines.right": False})


def gene_class(gt):
    for k in ["ct_0", "ct_1", "ct_2"]:
        if f"{k}_mkr" in gt:
            return f"{k} marker"
    return "housekeeping"

print("Imports OK")

Imports OK

3.1 Run gene parameters + cell layout + counts¶

In [2]:

Copied!





config = SimulationConfig(
    seed=42, canvas_size=200.0,
    n_cells=250, n_celltype=3,
    celltype_proportion=[0.5, 0.3, 0.2],
    continuity=0.25,
    n_genes=120, n_markers=25,
    marker_mu=0.75, marker_cv=0.6,
    silence_mu=0.05, theta_alpha=2.0,
)

sim = SpotlessSimulator(config)
sim.generate_gene_parameters()
sim.initialize_cells()
sim.simulate_counts()

counts   = sim.count_array    # (n_cells, n_genes)
cell_gdf = sim.cell_gdf
gpar_df  = sim.gpar_df

print(f"Count matrix shape: {counts.shape}")
print(f"Total transcripts : {counts.sum():,}")
print(f"Mean per cell     : {counts.sum(axis=1).mean():.1f}")
print(f"Sparsity          : {(counts == 0).mean():.1%}")
config = SimulationConfig(
    seed=42, canvas_size=200.0,
    n_cells=250, n_celltype=3,
    celltype_proportion=[0.5, 0.3, 0.2],
    continuity=0.25,
    n_genes=120, n_markers=25,
    marker_mu=0.75, marker_cv=0.6,
    silence_mu=0.05, theta_alpha=2.0,
)

sim = SpotlessSimulator(config)
sim.generate_gene_parameters()
sim.initialize_cells()
sim.simulate_counts()

counts   = sim.count_array    # (n_cells, n_genes)
cell_gdf = sim.cell_gdf
gpar_df  = sim.gpar_df

print(f"Count matrix shape: {counts.shape}")
print(f"Total transcripts : {counts.sum():,}")
print(f"Mean per cell     : {counts.sum(axis=1).mean():.1f}")
print(f"Sparsity          : {(counts == 0).mean():.1%}")

Count matrix shape: (250, 120)
Total transcripts : 7,551
Mean per cell     : 30.2
Sparsity          : 83.5%

3.2 Count matrix overview¶

Heatmap (log1p) — cells sorted by cell type reveal marker gene blocks
Counts per cell — shows cell-to-cell variability
Counts per gene — long tail is typical of scRNA-seq-like data

In [3]:

Copied!





sort_idx = cell_gdf.sort_values("celltype").index.tolist()
gpar_df["gene_class"] = gpar_df["gene_type"].apply(gene_class)

fig, axs = plt.subplots(1, 3, figsize=(15, 4))

# Heatmap
sns.heatmap(np.log1p(counts[sort_idx, :]), ax=axs[0], cmap="magma",
            xticklabels=False, yticklabels=False,
            cbar_kws={"label": "log1p(count)"})
axs[0].set_title("Count matrix (log1p, cells sorted by type)")
axs[0].set_xlabel("Genes"); axs[0].set_ylabel("Cells")

# Counts per cell
tc_df = pd.DataFrame({"total": counts.sum(axis=1), "celltype": cell_gdf["celltype"].values})
sns.boxplot(data=tc_df, x="celltype", y="total",
            palette=CT_PALETTE, width=0.5, ax=axs[1],
            flierprops={"marker": ".", "markersize": 3})
axs[1].set_title("Total counts per cell")
axs[1].set_xlabel("")

# Counts per gene
axs[2].hist(counts.sum(axis=0), bins=35, color="#555599", alpha=0.8)
axs[2].axvline(counts.sum(axis=0).mean(), color="crimson", ls="--", lw=1.5, label="mean")
axs[2].set_title("Total counts per gene")
axs[2].set_xlabel("Transcript count")
axs[2].legend()

fig.suptitle("Count matrix overview", fontsize=12)
fig.tight_layout()
plt.show()
sort_idx = cell_gdf.sort_values("celltype").index.tolist()
gpar_df["gene_class"] = gpar_df["gene_type"].apply(gene_class)

fig, axs = plt.subplots(1, 3, figsize=(15, 4))

# Heatmap
sns.heatmap(np.log1p(counts[sort_idx, :]), ax=axs[0], cmap="magma",
            xticklabels=False, yticklabels=False,
            cbar_kws={"label": "log1p(count)"})
axs[0].set_title("Count matrix (log1p, cells sorted by type)")
axs[0].set_xlabel("Genes"); axs[0].set_ylabel("Cells")

# Counts per cell
tc_df = pd.DataFrame({"total": counts.sum(axis=1), "celltype": cell_gdf["celltype"].values})
sns.boxplot(data=tc_df, x="celltype", y="total",
            palette=CT_PALETTE, width=0.5, ax=axs[1],
            flierprops={"marker": ".", "markersize": 3})
axs[1].set_title("Total counts per cell")
axs[1].set_xlabel("")

# Counts per gene
axs[2].hist(counts.sum(axis=0), bins=35, color="#555599", alpha=0.8)
axs[2].axvline(counts.sum(axis=0).mean(), color="crimson", ls="--", lw=1.5, label="mean")
axs[2].set_title("Total counts per gene")
axs[2].set_xlabel("Transcript count")
axs[2].legend()

fig.suptitle("Count matrix overview", fontsize=12)
fig.tight_layout()
plt.show()

No description has been provided for this image

3.3 Gene class count distributions¶

Marker genes express highly in their own cell type and are nearly silent in others.
Housekeeping genes show moderate, consistent expression across all cell types.

In [4]:

Copied!





# Extract count per gene separately for ct_0 cells and all cells
ct0_mask   = cell_gdf["celltype"].values == "ct_0"
ct0_counts = counts[ct0_mask, :]  # counts from ct_0 cells only

fig, axs = plt.subplots(1, 2, figsize=(12, 4))

for cls, color in GENE_PALETTE.items():
    mask = gpar_df["gene_class"] == cls
    if mask.sum() == 0:
        continue
    gene_idx = gpar_df.index[mask].tolist()

    # Mean counts across ct_0 cells (all genes of this class)
    means_ct0 = ct0_counts[:, gene_idx].mean(axis=0)
    axs[0].hist(means_ct0, bins=20, color=color, alpha=0.65, density=True, label=cls)

    # Mean counts across all cells
    means_all = counts[:, gene_idx].mean(axis=0)
    axs[1].hist(means_all, bins=20, color=color, alpha=0.65, density=True, label=cls)

axs[0].set_title("Mean counts per gene in ct_0 cells")
axs[0].set_xlabel("Mean count"); axs[0].set_ylabel("Density")
axs[0].legend(fontsize=8, title="Gene class")

axs[1].set_title("Mean counts per gene across all cells")
axs[1].set_xlabel("Mean count")
axs[1].legend(fontsize=8, title="Gene class")

fig.suptitle("Count distributions by gene class", fontsize=12)
fig.tight_layout()
plt.show()
# Extract count per gene separately for ct_0 cells and all cells
ct0_mask   = cell_gdf["celltype"].values == "ct_0"
ct0_counts = counts[ct0_mask, :]  # counts from ct_0 cells only

fig, axs = plt.subplots(1, 2, figsize=(12, 4))

for cls, color in GENE_PALETTE.items():
    mask = gpar_df["gene_class"] == cls
    if mask.sum() == 0:
        continue
    gene_idx = gpar_df.index[mask].tolist()

    # Mean counts across ct_0 cells (all genes of this class)
    means_ct0 = ct0_counts[:, gene_idx].mean(axis=0)
    axs[0].hist(means_ct0, bins=20, color=color, alpha=0.65, density=True, label=cls)

    # Mean counts across all cells
    means_all = counts[:, gene_idx].mean(axis=0)
    axs[1].hist(means_all, bins=20, color=color, alpha=0.65, density=True, label=cls)

axs[0].set_title("Mean counts per gene in ct_0 cells")
axs[0].set_xlabel("Mean count"); axs[0].set_ylabel("Density")
axs[0].legend(fontsize=8, title="Gene class")

axs[1].set_title("Mean counts per gene across all cells")
axs[1].set_xlabel("Mean count")
axs[1].legend(fontsize=8, title="Gene class")

fig.suptitle("Count distributions by gene class", fontsize=12)
fig.tight_layout()
plt.show()

3.4 Effect of `marker_mu` on count distribution¶

Increasing marker_mu raises the mean of the marker gene prior, directly lifting total counts per cell.

In [5]:

Copied!





mu_values = [0.2, 0.5, 1.0, 2.5]
fig, axs  = plt.subplots(1, len(mu_values), figsize=(14, 3.5), sharey=False)

for ax, mu in zip(axs, mu_values):
    cfg = SimulationConfig(
        seed=1, n_cells=200, n_celltype=3, n_genes=60, n_markers=20,
        marker_mu=mu, marker_cv=0.5, silence_mu=0.02, continuity=0.2,
    )
    s = SpotlessSimulator(cfg)
    s.generate_gene_parameters()
    s.initialize_cells()
    s.simulate_counts()

    totals = s.count_array.sum(axis=1)
    ct_labels = s.cell_gdf["celltype"].values
    for ct, color in CT_PALETTE.items():
        mask = ct_labels == ct
        if mask.any():
            ax.hist(totals[mask], bins=20, color=color, alpha=0.6, label=ct)
    ax.set_title(f"marker_mu = {mu}")
    ax.set_xlabel("Total counts/cell")

axs[0].set_ylabel("Cells")
axs[-1].legend(fontsize=8, title="Cell type")
fig.suptitle("marker_mu — higher = more transcripts per cell", fontsize=12, y=1.02)
fig.tight_layout()
plt.show()
mu_values = [0.2, 0.5, 1.0, 2.5]
fig, axs  = plt.subplots(1, len(mu_values), figsize=(14, 3.5), sharey=False)

for ax, mu in zip(axs, mu_values):
    cfg = SimulationConfig(
        seed=1, n_cells=200, n_celltype=3, n_genes=60, n_markers=20,
        marker_mu=mu, marker_cv=0.5, silence_mu=0.02, continuity=0.2,
    )
    s = SpotlessSimulator(cfg)
    s.generate_gene_parameters()
    s.initialize_cells()
    s.simulate_counts()

    totals = s.count_array.sum(axis=1)
    ct_labels = s.cell_gdf["celltype"].values
    for ct, color in CT_PALETTE.items():
        mask = ct_labels == ct
        if mask.any():
            ax.hist(totals[mask], bins=20, color=color, alpha=0.6, label=ct)
    ax.set_title(f"marker_mu = {mu}")
    ax.set_xlabel("Total counts/cell")

axs[0].set_ylabel("Cells")
axs[-1].legend(fontsize=8, title="Cell type")
fig.suptitle("marker_mu — higher = more transcripts per cell", fontsize=12, y=1.02)
fig.tight_layout()
plt.show()

/work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers.
  warnings.warn(
/work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers.
  warnings.warn(
/work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers.
  warnings.warn(
/work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers.
  warnings.warn(

3.5 Effect of `theta_alpha` on overdispersion¶

theta_alpha controls the Gamma prior on the NB dispersion θ.
Lower theta_alpha → prior mass at small θ → more overdispersed (wider) count distributions.

In [6]:

Copied!





theta_alphas = [0.5, 1.0, 5.0, 20.0]
fig, axs     = plt.subplots(1, len(theta_alphas), figsize=(14, 3.5), sharey=False)

for ax, ta in zip(axs, theta_alphas):
    cfg = SimulationConfig(
        seed=2, n_cells=150, n_celltype=3, n_genes=60, n_markers=20,
        marker_mu=1.0, theta_alpha=ta, theta_rate=1.0, continuity=0.2,
    )
    s = SpotlessSimulator(cfg)
    s.generate_gene_parameters()
    s.initialize_cells()
    s.simulate_counts()

    # Log(variance/mean) per gene — index of dispersion
    gene_mean = s.count_array.mean(axis=0)
    gene_var  = s.count_array.var(axis=0)
    disp_idx  = np.where(gene_mean > 0, gene_var / gene_mean, 0)

    ax.hist(np.log1p(disp_idx), bins=25, color="#dd7733", alpha=0.85, edgecolor="white")
    ax.set_title(f"theta_alpha = {ta}\nmedian Var/Mean = {np.median(disp_idx):.2f}")
    ax.set_xlabel("log(1 + Var/Mean) per gene")

axs[0].set_ylabel("Genes")
fig.suptitle("theta_alpha — lower = more overdispersion (higher Var/Mean)", fontsize=12, y=1.02)
fig.tight_layout()
plt.show()
theta_alphas = [0.5, 1.0, 5.0, 20.0]
fig, axs     = plt.subplots(1, len(theta_alphas), figsize=(14, 3.5), sharey=False)

for ax, ta in zip(axs, theta_alphas):
    cfg = SimulationConfig(
        seed=2, n_cells=150, n_celltype=3, n_genes=60, n_markers=20,
        marker_mu=1.0, theta_alpha=ta, theta_rate=1.0, continuity=0.2,
    )
    s = SpotlessSimulator(cfg)
    s.generate_gene_parameters()
    s.initialize_cells()
    s.simulate_counts()

    # Log(variance/mean) per gene — index of dispersion
    gene_mean = s.count_array.mean(axis=0)
    gene_var  = s.count_array.var(axis=0)
    disp_idx  = np.where(gene_mean > 0, gene_var / gene_mean, 0)

    ax.hist(np.log1p(disp_idx), bins=25, color="#dd7733", alpha=0.85, edgecolor="white")
    ax.set_title(f"theta_alpha = {ta}\nmedian Var/Mean = {np.median(disp_idx):.2f}")
    ax.set_xlabel("log(1 + Var/Mean) per gene")

axs[0].set_ylabel("Genes")
fig.suptitle("theta_alpha — lower = more overdispersion (higher Var/Mean)", fontsize=12, y=1.02)
fig.tight_layout()
plt.show()

/work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers.
  warnings.warn(
/tmp/ipykernel_3686117/2924577456.py:17: RuntimeWarning: invalid value encountered in divide
  disp_idx  = np.where(gene_mean > 0, gene_var / gene_mean, 0)
/work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers.
  warnings.warn(
/work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers.
  warnings.warn(
/work/PRTNR/CHUV/DIR/rgottar1/single_cell_all/users/jsun1/STpuppeteer/src/STpuppeteer/simulation/config.py:343: UserWarning: Only 0 housekeeping genes will be simulated. Consider increasing n_genes or decreasing n_markers.
  warnings.warn(

Summary¶

Parameter	What it controls	Typical range
`marker_mu`	Signal level — mean marker gene expression	0.3 – 2.0
`marker_cv`	Gene-to-gene variability within the marker class	0.3 – 1.0
`theta_alpha`	Overdispersion — lower = noisier counts	0.5 – 5.0
`n_markers`	How many cell-type-specific genes per type	10 – 100

Next: 04_simulate_transcripts.ipynb — how counts become spatially-placed transcripts with leakage.