% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data-sim.R
\name{sim_dgp_scenarios}
\alias{sim_dgp_scenarios}
\alias{sim_dgp_correlated}
\alias{sim_dgp_mediated}
\alias{sim_dgp_confounded}
\alias{sim_dgp_interactions}
\alias{sim_dgp_independent}
\title{Simulation DGPs for Feature Importance Method Comparison}
\usage{
sim_dgp_correlated(n = 500L, r = 0.9)

sim_dgp_mediated(n = 500L)

sim_dgp_confounded(n = 500L, hidden = TRUE)

sim_dgp_interactions(n = 500L)

sim_dgp_independent(n = 500L)
}
\arguments{
\item{n}{(\code{integer(1)}: \code{500L}) Number of observations to generate.}

\item{r}{(\code{numeric(1)}: \code{0.9}) Correlation between x1 and x2. Must be between -1 and 1.}

\item{hidden}{(\code{logical(1)}: \code{TRUE}) Whether to hide the confounder from the returned task.
If \code{FALSE}, the confounder is included as a feature, allowing direct adjustment.
If \code{TRUE} (default), only the proxy is available, simulating unmeasured confounding.}
}
\value{
A regression task (\link[mlr3:TaskRegr]{mlr3::TaskRegr}) with a \link[data.table:data.table]{data.table} backend.
}
\description{
These data generating processes (DGPs) are designed to illustrate specific
strengths and weaknesses of different feature importance methods such as
permutation feature importance (PFI), conditional feature importance (CFI),
and relative feature importance (RFI).
Each DGP focuses on one primary challenge to make the differences between methods clear.
}
\details{
\strong{Correlated Features DGP:}
This DGP creates highly correlated predictors, one of which (\code{x2}) has no effect on the outcome.
Marginal methods such as PFI may attribute importance to \code{x2} purely because of its
correlation with \code{x1}, while CFI reflects each feature's conditional contribution.

\strong{Mathematical Model:}
\deqn{(X_1, X_2)^T \sim \text{MVN}(0, \Sigma)}
where \eqn{\Sigma} is a \eqn{2 \times 2} covariance matrix with 1 on the diagonal and correlation \eqn{r} on the off-diagonal.
\deqn{X_3 \sim N(0,1), \quad X_4 \sim N(0,1)}
\deqn{Y = 2 \cdot X_1 + X_3 + \varepsilon}
where \eqn{\varepsilon \sim N(0, 0.2^2)}.

\strong{Feature Properties:}
\itemize{
\item \code{x1}: Standard normal from MVN, direct causal effect on y (\eqn{\beta = 2.0})
\item \code{x2}: Correlated with \code{x1} (correlation = \code{r}), NO causal effect on y (\eqn{\beta = 0})
\item \code{x3}: Independent standard normal, direct causal effect on y (\eqn{\beta = 1.0})
\item \code{x4}: Independent standard normal, no effect on y (\eqn{\beta = 0})
}

\strong{Expected Behavior:}
\itemize{
\item Results depend on the learner used and on the strength of the correlation (\code{r})
\item \strong{Marginal methods} (PFI, Marginal SAGE): Should falsely assign importance to x2 due to correlation with x1
\item \strong{CFI}: Should correctly assign near-zero importance to x2
\item x2 is a "spurious predictor" - correlated with causal feature but not causal itself
}

\strong{Mediated Effects DGP:}
This DGP demonstrates the difference between total and direct causal effects.
Some features affect the outcome only through mediators.

\strong{Mathematical Model:}
\deqn{\text{exposure} \sim N(0,1), \quad \text{direct} \sim N(0,1)}
\deqn{\text{mediator} = 0.8 \cdot \text{exposure} + 0.6 \cdot \text{direct} + \varepsilon_m}
\deqn{Y = 1.5 \cdot \text{mediator} + 0.5 \cdot \text{direct} + \varepsilon}
where \eqn{\varepsilon_m \sim N(0, 0.3^2)} and \eqn{\varepsilon \sim N(0, 0.2^2)}.

\strong{Feature Properties:}
\itemize{
\item \code{exposure}: Has no direct effect on y; it affects y only through the mediator (total effect = \eqn{0.8 \cdot 1.5 = 1.2})
\item \code{mediator}: Mediates the effect of exposure on y
\item \code{direct}: Has both a direct effect on y and an effect on the mediator
\item \code{noise}: No causal relationship to y
}

\strong{Causal Structure:} exposure -> mediator -> y <- direct -> mediator

\strong{Confounding DGP:}
This DGP includes a confounder that affects both a feature and the outcome.
It uses simple coefficients for easy interpretation.

\strong{Mathematical Model:}
\deqn{H \sim N(0,1)}
\deqn{X_1 = H + \varepsilon_1}
\deqn{\text{proxy} = H + \varepsilon_p, \quad \text{independent} \sim N(0,1)}
\deqn{Y = H + X_1 + \text{independent} + \varepsilon}
where all \eqn{\varepsilon \sim N(0, 0.5^2)} independently.

\strong{Model Structure:}
\itemize{
\item Confounder H ~ N(0,1) (potentially unobserved)
\item x1 = H + noise (affected by confounder)
\item proxy = H + noise (noisy measurement of confounder)
\item independent ~ N(0,1) (truly independent)
\item y = H + x1 + independent + noise
}

\strong{Expected Behavior:}
\itemize{
\item \strong{PFI}: Will show inflated importance for x1 due to confounding
\item \strong{CFI}: Should partially account for confounding through conditional sampling and reduce the importance of x1
\item \strong{RFI conditioning on proxy}: Should reduce confounding bias, since the proxy is a noisy measurement of the confounder
}

\strong{Interaction Effects DGP:}
This DGP demonstrates a pure interaction effect: \code{x1} and \code{x2} affect the outcome only through their product and have no main effects.

\strong{Mathematical Model:}
\deqn{Y = 2 \cdot X_1 \cdot X_2 + X_3 + \varepsilon}
where \eqn{X_j \sim N(0,1)} independently and \eqn{\varepsilon \sim N(0, 0.5^2)}.

\strong{Feature Properties:}
\itemize{
\item \code{x1}, \code{x2}: Independent features with ONLY interaction effect (no main effects)
\item \code{x3}: Independent feature with main effect only
\item \code{noise1}, \code{noise2}: No causal effects
}

\strong{Expected Behavior:}
\itemize{
\item Results depend on the learner used and its ability to model interactions
}

\strong{Independent Features DGP:}
This is a baseline scenario where all features are independent and their
effects are additive. All importance methods should give similar results.

\strong{Mathematical Model:}
\deqn{Y = 2.0 \cdot X_1 + 1.0 \cdot X_2 + 0.5 \cdot X_3 + \varepsilon}
where \eqn{X_j \sim N(0,1)} independently and \eqn{\varepsilon \sim N(0, 0.2^2)}.

\strong{Feature Properties:}
\itemize{
\item \code{important1-3}: Independent features with different effect sizes (\eqn{X_1}, \eqn{X_2}, \eqn{X_3} in the model above, with coefficients 2.0, 1.0 and 0.5)
\item \code{unimportant1-2}: Independent noise features with no effect
}

\strong{Expected Behavior:}
\itemize{
\item \strong{All methods}: Should rank features consistently by their true effect sizes
\item \strong{Ground truth}: important1 > important2 > important3 > unimportant1,2 (approximately 0)
}
}
\section{Functions}{
\itemize{
\item \code{sim_dgp_correlated()}: Correlated features demonstrating PFI's limitations

\item \code{sim_dgp_mediated()}: Mediated effects showing direct vs total importance

\item \code{sim_dgp_confounded()}: Confounding scenario for conditional sampling

\item \code{sim_dgp_interactions()}: Interaction effects between features

\item \code{sim_dgp_independent()}: Independent features baseline scenario

}}
\examples{
task = sim_dgp_correlated(200)
task$data()

# With different correlation
task_high_cor = sim_dgp_correlated(200, r = 0.95)
cor(task_high_cor$data()$x1, task_high_cor$data()$x2)
task = sim_dgp_mediated(200)
task$data()
# Hidden confounder scenario (traditional)
task_hidden = sim_dgp_confounded(200, hidden = TRUE)
task_hidden$feature_names  # proxy available but not confounder

# Observable confounder scenario
task_observed = sim_dgp_confounded(200, hidden = FALSE)
task_observed$feature_names  # both confounder and proxy available
task = sim_dgp_interactions(200)
task$data()
task = sim_dgp_independent(200)
task$data()
}
\references{
Ewald F, Bothmann L, Wright M, Bischl B, Casalicchio G, König G (2024).
\dQuote{A Guide to Feature Importance Methods for Scientific Inference.}
In Longo L, Lapuschkin S, Seifert C (eds.), \emph{Explainable Artificial Intelligence}, 440--464.
ISBN 978-3-031-63797-1, \doi{10.1007/978-3-031-63797-1_22}.
}
\seealso{
Other simulation: 
\code{\link{sim_dgp_ewald}()}
}
\concept{simulation}
