This article shows the plot grammar that Phase 18 simulation reports should use before broad simulation grids are advertised as evidence. It is a display contract, not a final result report. The small tables below are illustrative fixtures that mimic the columns a real simulation summary should carry.
Simulation figures should answer the same reader questions across continuous, proportion, count, and meta-analysis examples:
- Does the estimator recover the truth without important bias?
- How large is the error, measured by RMSE?
- Do interval procedures cover at the advertised rate?
- Does power increase under the intended effect?
- How often do fits converge, hit boundaries, warn, or fail?
- How much runtime does each surface require?
library(ggplot2)
theme_sim_grammar <- function() {
theme_minimal(base_size = 11) +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
plot.title.position = "plot",
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(colour = "grey30", lineheight = 1.05),
strip.text = element_text(face = "bold"),
legend.position = "bottom",
plot.margin = margin(6, 10, 12, 6)
)
}
surface_palette <- c(
"Gaussian location-scale" = "#0072B2",
"Proportion" = "#009E73",
"Count" = "#D55E00",
"Meta-analysis" = "#CC79A7"
)
surface_levels <- names(surface_palette)
surface_facet_labels <- c(
"Gaussian location-scale" = "Gaussian\nlocation-scale",
"Proportion" = "Proportion",
"Count" = "Count",
"Meta-analysis" = "Meta-analysis"
)
estimand_display_labels <- c(
"beta_x" = "beta_x",
"sigma" = "sigma",
"sd_group" = "sd(group)",
"rho12" = "rho12"
)Bias and RMSE
Bias and RMSE should be plotted beside the estimand. A single average
across all parameters hides the distinction between fixed effects,
residual scale, random-effect SDs, and residual correlation
rho12.
accuracy_summary <- data.frame(
surface = rep(surface_levels, each = 4),
estimand = rep(c("beta_x", "sigma", "sd_group", "rho12"), 4),
bias = c(
0.01, -0.02, 0.03, 0.00,
0.00, 0.01, 0.02, NA,
0.02, NA, 0.06, NA,
-0.01, 0.02, NA, 0.03
),
rmse = c(
0.09, 0.12, 0.18, 0.16,
0.07, 0.09, 0.15, NA,
0.12, NA, 0.22, NA,
0.08, 0.11, NA, 0.14
),
n_replicate = c(
300, 300, 260, 280,
300, 300, 250, NA,
280, NA, 220, NA,
300, 300, NA, 260
)
)
accuracy_summary$surface <- factor(
accuracy_summary$surface,
levels = surface_levels
)
accuracy_summary$error_sd <- sqrt(
pmax(accuracy_summary$rmse^2 - accuracy_summary$bias^2, 0)
)
accuracy_summary$bias_mcse <- with(
accuracy_summary,
error_sd / sqrt(n_replicate)
)
accuracy_summary$rmse_mcse <- with(
accuracy_summary,
rmse / sqrt(2 * n_replicate)
)
accuracy_summary$bias_low <- accuracy_summary$bias -
1.96 * accuracy_summary$bias_mcse
accuracy_summary$bias_high <- accuracy_summary$bias +
1.96 * accuracy_summary$bias_mcse
accuracy_summary$rmse_low <- pmax(
0,
accuracy_summary$rmse - 1.96 * accuracy_summary$rmse_mcse
)
accuracy_summary$rmse_high <- accuracy_summary$rmse +
1.96 * accuracy_summary$rmse_mcse
accuracy_summary$estimand <- factor(
accuracy_summary$estimand,
levels = rev(c("beta_x", "sigma", "sd_group", "rho12"))
)
display_replicates <- 49
accuracy_replicates <- do.call(
rbind,
lapply(seq_len(nrow(accuracy_summary)), function(i) {
row <- accuracy_summary[i, ]
if (is.na(row$bias) || is.na(row$rmse)) {
return(NULL)
}
data.frame(
surface = row$surface,
estimand = row$estimand,
error = row$bias + qnorm(ppoints(display_replicates)) * row$error_sd
)
})
)
accuracy_replicates$surface <- factor(
accuracy_replicates$surface,
levels = surface_levels
)
accuracy_replicates$estimand <- factor(
accuracy_replicates$estimand,
levels = levels(accuracy_summary$estimand)
)
set.seed(20260519)
ggplot(accuracy_replicates, aes(error, estimand)) +
geom_vline(xintercept = 0, linetype = "dashed", colour = "grey55") +
geom_violin(
aes(
fill = surface
),
width = 0.62,
alpha = 0.16,
colour = NA,
scale = "width",
trim = FALSE
) +
geom_point(
aes(colour = surface),
position = position_jitter(height = 0.05, width = 0),
alpha = 0.22,
size = 0.65,
show.legend = FALSE
) +
geom_segment(
data = accuracy_summary,
aes(
x = bias_low,
xend = bias_high,
y = estimand,
yend = estimand,
colour = surface
),
linewidth = 0.85,
na.rm = TRUE
) +
geom_point(
data = accuracy_summary,
aes(bias, estimand, colour = surface),
size = 2.4,
na.rm = TRUE
) +
facet_wrap(
~surface,
nrow = 1,
labeller = labeller(surface = as_labeller(surface_facet_labels))
) +
scale_colour_manual(values = surface_palette, drop = FALSE) +
scale_fill_manual(values = surface_palette, drop = FALSE) +
labs(
title = "Bias displays should show replicate-level error",
subtitle = paste(
"Clouds and faint dots are replicate errors;",
"large points and bars are mean bias with 95% MCSE"
),
x = "Estimate minus truth",
y = NULL,
colour = "Surface"
) +
scale_y_discrete(labels = estimand_display_labels) +
guides(colour = "none", fill = "none") +
theme_sim_grammar()
ggplot(accuracy_summary, aes(rmse, estimand, colour = surface)) +
geom_segment(
aes(
x = rmse_low,
xend = rmse_high,
y = estimand,
yend = estimand,
colour = surface
),
linewidth = 0.9,
alpha = 0.8,
na.rm = TRUE
) +
geom_point(size = 2.4, na.rm = TRUE) +
facet_wrap(
~surface,
nrow = 1,
labeller = labeller(surface = as_labeller(surface_facet_labels))
) +
scale_colour_manual(values = surface_palette, drop = FALSE) +
labs(
title = "RMSE needs its own uncertainty display",
subtitle = "Facets are surfaces; points and bars are RMSE and 95% MCSE intervals for each estimand",
x = "Root mean-square error",
y = NULL,
colour = "Surface"
) +
scale_x_continuous(
breaks = c(0, 0.25, 0.50, 0.75),
limits = c(0, 0.82),
expand = expansion(mult = c(0.01, 0.05))
) +
scale_y_discrete(labels = estimand_display_labels) +
guides(colour = "none") +
theme_sim_grammar()
The missing rows are part of the message. A count pilot should not
quietly fill rho12 with zeros, and a proportion example
should not invent a random-effect SD if that surface was not fitted.
Real bias reports should replace the fixture quantiles above with one
row per replicate estimate or error, then overplot the aggregate mean
and MCSE interval produced by the Phase 18 summary helpers. RMSE should
remain an aggregate root-mean-square summary with its own MCSE interval,
not a cloud whose center could be mistaken for a mean absolute
error.
Coverage and Power
Coverage and power are proportions, so every point should carry the number of replicates that contributed to it and a Monte Carlo uncertainty interval. The fixture below uses a binomial MCSE approximation for display.
interval_summary <- data.frame(
surface = rep(surface_levels, each = 4),
estimand = rep(c("beta_x", "sigma", "sd_group", "rho12"), 4),
coverage = c(
0.95, 0.93, 0.91, 0.94,
0.94, 0.92, 0.90, NA,
0.92, NA, 0.86, NA,
0.95, 0.94, NA, 0.92
),
power = c(
0.82, 0.64, 0.58, 0.70,
0.78, 0.60, 0.52, NA,
0.74, NA, 0.46, NA,
0.80, 0.61, NA, 0.66
),
n_interval = c(
280, 280, 240, 260,
280, 280, 230, NA,
250, NA, 200, NA,
280, 280, NA, 240
)
)
interval_summary$surface <- factor(
interval_summary$surface,
levels = surface_levels
)
interval_long <- rbind(
data.frame(
interval_summary[c("surface", "estimand", "n_interval")],
measure = "Coverage",
value = interval_summary$coverage,
target = 0.95
),
data.frame(
interval_summary[c("surface", "estimand", "n_interval")],
measure = "Power",
value = interval_summary$power,
target = 0.80
)
)
interval_long$mcse <- sqrt(
interval_long$value * (1 - interval_long$value) / interval_long$n_interval
)
interval_long$lower <- pmax(0, interval_long$value - 1.96 * interval_long$mcse)
interval_long$upper <- pmin(1, interval_long$value + 1.96 * interval_long$mcse)
interval_long$measure <- factor(
interval_long$measure,
levels = c("Coverage", "Power")
)
interval_long$estimand <- factor(
interval_long$estimand,
levels = rev(c("beta_x", "sigma", "sd_group", "rho12"))
)
ggplot(interval_long, aes(value, estimand, colour = surface)) +
geom_vline(
aes(xintercept = target),
linetype = "dashed",
colour = "grey55"
) +
geom_segment(
aes(x = lower, xend = upper, yend = estimand),
linewidth = 0.55,
na.rm = TRUE
) +
geom_point(size = 2.1, na.rm = TRUE) +
facet_wrap(~measure, scales = "free_x") +
scale_colour_manual(values = surface_palette) +
scale_y_discrete(labels = estimand_display_labels) +
labs(
title = "Interval coverage and power need Monte Carlo uncertainty",
subtitle = "Dashed lines mark nominal coverage and a planned power benchmark",
x = "Proportion",
y = NULL,
colour = "Surface"
) +
theme_sim_grammar()
The plot should make pilot status obvious. With few replicates, a coverage estimate near 0.95 can still be too noisy for a release claim.
Convergence and Runtime
Convergence, positive-definite Hessian status, and runtime should sit beside accuracy. A surface with low bias but frequent warnings or high runtime is not ready for broad claims.
run_summary <- data.frame(
surface = surface_levels,
convergence_rate = c(0.98, 0.96, 0.90, 0.95),
pdhess_rate = c(0.96, 0.94, 0.86, 0.93),
elapsed_median = c(0.35, 0.28, 0.52, 0.44),
elapsed_p90 = c(0.72, 0.58, 1.35, 0.96)
)
run_summary$surface <- factor(run_summary$surface, levels = surface_levels)
run_long <- rbind(
data.frame(
surface = run_summary$surface,
measure = "Converged",
value = run_summary$convergence_rate
),
data.frame(
surface = run_summary$surface,
measure = "pdHess",
value = run_summary$pdhess_rate
),
data.frame(
surface = run_summary$surface,
measure = "Median runtime",
value = run_summary$elapsed_median
),
data.frame(
surface = run_summary$surface,
measure = "90th percentile runtime",
value = run_summary$elapsed_p90
)
)
run_long$panel <- ifelse(
grepl("runtime", run_long$measure),
"Runtime (seconds)",
"Fit status proportion"
)
run_long$measure <- factor(
run_long$measure,
levels = c(
"Converged",
"pdHess",
"Median runtime",
"90th percentile runtime"
)
)
ggplot(run_long, aes(value, surface, fill = measure)) +
geom_col(position = position_dodge(width = 0.72), width = 0.64) +
facet_wrap(~panel, scales = "free_x") +
scale_fill_manual(values = c(
"Converged" = "#0072B2",
"pdHess" = "#56B4E9",
"Median runtime" = "#009E73",
"90th percentile runtime" = "#E69F00"
)) +
labs(
title = "Convergence and runtime belong beside accuracy",
subtitle = "Fast successful fits and stable uncertainty are separate checks",
x = "Value",
y = NULL,
fill = "Measure"
) +
theme_sim_grammar()
Runtime summaries should report medians and high quantiles, not only means, because a few difficult cells can dominate overnight simulation planning.
Failure Ledger
Failures and warnings are data. They should be summarised by surface and condition instead of filtered away before plotting.
failure_ledger <- data.frame(
surface = rep(surface_levels, each = 5),
status = rep(c("ok", "warning", "boundary", "error", "skipped"), 4),
count = c(
268, 14, 10, 4, 4,
262, 18, 12, 5, 3,
232, 30, 20, 12, 6,
252, 20, 14, 8, 6
)
)
failure_ledger$surface <- factor(failure_ledger$surface, levels = surface_levels)
failure_ledger$status <- factor(
failure_ledger$status,
levels = c("ok", "warning", "boundary", "error", "skipped")
)
ggplot(failure_ledger, aes(surface, count, fill = status)) +
geom_col(width = 0.7) +
scale_fill_manual(values = c(
"ok" = "#009E73",
"warning" = "#E69F00",
"boundary" = "#56B4E9",
"error" = "#D55E00",
"skipped" = "#999999"
)) +
labs(
title = "Warnings and failures stay visible",
subtitle = "Failure ledgers keep hard cells in the evidence trail",
x = NULL,
y = "Replicate count",
fill = "Status"
) +
theme_sim_grammar()
Real reports should pair this figure with a table that preserves warning classes, error messages, seeds, and cell identifiers. The plot is for scanning; the ledger is for debugging and reproducibility.
Minimum Table Contract
A real simulation result can add columns, but it should not drop these fields:
| Field | Purpose |
|---|---|
surface |
Human-readable model surface, such as Gaussian location-scale, count, proportion, or meta-analysis |
cell_id |
The varied condition, such as sample size, effect size, true SD, or
known-V structure |
estimand |
The scientific target, such as beta_x,
sigma, sd_group, or rho12
|
truth, estimate, error
|
The value used for bias and RMSE |
conf.low, conf.high,
interval_status
|
The interval source and whether coverage is meaningful |
converged, pdHess,
boundary
|
Fit diagnostics that should be summarised and plotted |
elapsed, warning_count,
error_message
|
Runtime and failure-ledger fields |
n_replicate, mcse
|
Monte Carlo uncertainty for any aggregate claim |
The plotting grammar is deliberately conservative. A feature should not enter a comprehensive plot until the model surface is fitted, the estimand is named, the interval source is known, and failed replicates remain visible.