diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
deleted file mode 100644
index 458c1f7..0000000
--- a/.github/workflows/main.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-- name: Upload coverage reports to Codecov
-  uses: codecov/codecov-action@v3
-  env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/save-packages-cache.yaml b/.github/workflows/save-packages-cache.yaml
new file mode 100644
index 0000000..9e7f0e0
--- /dev/null
+++ b/.github/workflows/save-packages-cache.yaml
@@ -0,0 +1,50 @@
+name: cache-R
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.11' # Specify the Python version you want to use
+
+      - name: Install Package in Editable Mode with Python Dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+
+      - name: Setup R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: '4.3.2' # Use the R version you prefer
+
+      - name: Install R packages
+        uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          cache: true
+          cache-version: 1
+          dependencies: 'NA'
+          install-pandoc: false
+          packages: |
+            grf
+            causalweight
+            mediation
+
+      - name: Install plmed package
+        run: |
+          R -e "pak::pkg_install('ohines/plmed')"
+
+      - name: Install Pytest
+        run: |
+          pip install pytest
\ No newline at end of file
diff --git a/.github/workflows/tests-with-R.yaml b/.github/workflows/tests-with-R.yaml
new file mode 100644
index 0000000..93cc116
--- /dev/null
+++ b/.github/workflows/tests-with-R.yaml
@@ -0,0 +1,54 @@
+name: CI-with-R
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.11' # Specify the Python version you want to use
+
+      - name: Install Package in Editable Mode with Python Dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+
+      - name: Setup R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: '4.3.2' # Use the R version you prefer
+
+      - name: Install R packages
+        uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          cache: true
+          cache-version: 1
+          dependencies: 'NA'
+          install-pandoc: false
+          packages: |
+            grf
+            causalweight
+            mediation
+
+      - name: Install plmed package
+        run: |
+          R -e "pak::pkg_install('ohines/plmed')"
+
+      - name: Install Pytest
+        run: |
+          pip install pytest
+
+      - name: Run tests
+        run: |
+          pytest
\ No newline at end of file
diff --git a/.github/workflows/tests-without-R.yaml b/.github/workflows/tests-without-R.yaml
new file mode 100644
index 0000000..afe51cb
--- /dev/null
+++ b/.github/workflows/tests-without-R.yaml
@@ -0,0 +1,33 @@
+name: CI-without-R
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.11' # Specify the Python version you want to use
+
+      - name: Install Package in Editable Mode with Python Dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+
+      - name: Install Pytest
+        run: |
+          pip install pytest
+
+      - name: Run tests
+        run: |
+          pytest
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index b6e4761..834d0a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# DS_STORE files
+src/.DS_Store
+.DS_Store
diff --git a/setup.py b/setup.py
index ca61aa1..ed1af7d 100644
--- a/setup.py
+++ b/setup.py
@@ -17,13 +17,14 @@
     ),
     package_dir={"": "src"},
     install_requires=[
-        'pandas>=1.2.1',
+        'pandas==1.2.1',
         'scikit-learn>=0.22.1',
         'numpy>=1.19.2',
         'rpy2>=2.9.4',
         'scipy>=1.5.2',
         'seaborn>=0.11.1',
-        'matplotlib>=3.3.2'
+        'matplotlib>=3.3.2',
+        "pytest"
     ],
     classifiers=[
         'Programming Language :: Python :: 3',
diff --git a/src/med_bench/get_estimation.py b/src/med_bench/get_estimation.py
index c2cf396..64f01ec 100644
--- a/src/med_bench/get_estimation.py
+++ b/src/med_bench/get_estimation.py
@@ -1,12 +1,8 @@
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
 
-
-import time
-import sys
-from rpy2.rinterface_lib.embedded import RRuntimeError
-import pandas as pd
 import numpy as np
+
 from .mediation import (
     mediation_IPW,
     mediation_coefficient_product,
@@ -18,6 +14,7 @@
     r_mediate,
 )
 
+
 def get_estimation(x, t, m, y, estimator, config):
     """Wrapper estimator fonction ; calls an estimator given mediation data
     in order to estimate total, direct, and indirect effects.
diff --git a/src/med_bench/get_simulated_data.py b/src/med_bench/get_simulated_data.py
index 6b8eefa..b646390 100644
--- a/src/med_bench/get_simulated_data.py
+++ b/src/med_bench/get_simulated_data.py
@@ -1,13 +1,7 @@
 import numpy as np
 from numpy.random import default_rng
 from scipy import stats
-import pandas as pd
-from pathlib import Path
-from scipy.stats import bernoulli
 from scipy.special import expit
-import matplotlib.pyplot as plt
-import pathlib
-import seaborn as sns
 
 
 def simulate_data(n,
@@ -23,16 +17,16 @@
                   beta_t_factor=1,
                   beta_m_factor=1):
     """Simulate data for mediation analysis
-    
+
     Parameters
     ----------
     n: :obj:`int`,
         Number of samples to generate.
-    
+
     rg: RandomState instance,
         Controls the pseudo random number generator used to generate the
         data at fit time.
-    
+
     mis_spec_m: obj:`bool`,
         Whether the mediator generation is misspecified or not
         defaults to False
@@ -40,7 +34,7 @@
     mis_spec_y: obj:`bool`,
         Whether the output model is misspecified or not
         defaults to False
-    
+
     dim_x: :obj:`int`, optional,
         Number of covariates in the input.
         Defaults to 1
@@ -48,13 +42,13 @@
     dim_m: :obj:`int`, optional,
         Number of mediatiors to generate.
         Defaults to 1
-    
+
     seed: :obj:`int` or None, optional,
         Controls the pseudo random number generator used to generate the
         coefficients of the model. Pass an int for reproducible output
         across multiple function calls.
Defaults to None - + type_m: :obj:`str`, Whether the mediator is binary or continuous Defaults to 'binary', @@ -66,7 +60,7 @@ def simulate_data(n, sigma_m :obj:`float`, noise variance on mediator Defaults to 0.5, - + beta_t_factor: :obj:`float`, scaling factor on treatment effect, Defaults to 1, @@ -74,18 +68,18 @@ def simulate_data(n, beta_m_factor: :obj:`float`, scaling factor on mediator, Defaults to 1, - + returns ------- x: ndarray of shape (n, dim_x) the simulated covariates - + t: ndarray of shape (n, 1) the simulated treatment - + m: ndarray of shape (n, dim_m) the simulated mediators - + y: ndarray of shape (n, 1) the simulated outcome @@ -137,9 +131,11 @@ def simulate_data(n, m = m_2d[np.arange(n), t[:, 0]].reshape(-1, 1) else: random_noise = sigma_m * rg.standard_normal((n, dim_m)) - m0 = x.dot(beta_x) + t0.dot(beta_t) + t0 * (x.dot(beta_xt)) + random_noise - m1 = x.dot(beta_x) + t1.dot(beta_t) + t1 * (x.dot(beta_xt)) + random_noise - m = x.dot(beta_x) + t.dot(beta_t) + t * (x.dot(beta_xt)) + random_noise + m0 = x.dot(beta_x) + t0.dot(beta_t) + t0 * \ + (x.dot(beta_xt)) + random_noise + m1 = x.dot(beta_x) + t1.dot(beta_t) + t1 * \ + (x.dot(beta_xt)) + random_noise + m = x.dot(beta_x) + t.dot(beta_t) + t * (x.dot(beta_xt)) + random_noise # generate the outcome Y gamma_m = np.ones((dim_m, 1)) * 0.5 / dim_m * beta_m_factor @@ -150,31 +146,38 @@ def simulate_data(n, else: gamma_t_m = np.zeros((dim_m, 1)) - y = x.dot(gamma_x) + gamma_t * t + m.dot(gamma_m) + m.dot(gamma_t_m) * t + sigma_y * rg.standard_normal((n, 1)) + y = x.dot(gamma_x) + gamma_t * t + m.dot(gamma_m) + \ + m.dot(gamma_t_m) * t + sigma_y * rg.standard_normal((n, 1)) # Compute differents types of effects if type_m == 'binary': theta_1 = gamma_t + gamma_t_m * np.mean(p_m1) theta_0 = gamma_t + gamma_t_m * np.mean(p_m0) - delta_1 = np.mean((p_m1 - p_m0) * (gamma_m.flatten() + gamma_t_m.dot(t1.T))) - delta_0 = np.mean((p_m1 - p_m0) * (gamma_m.flatten() + gamma_t_m.dot(t0.T))) + delta_1 = np.mean( + (p_m1 - p_m0) * (gamma_m.flatten() + gamma_t_m.dot(t1.T))) + delta_0 = np.mean( + (p_m1 - p_m0) * (gamma_m.flatten() + gamma_t_m.dot(t0.T))) else: - theta_1 = gamma_t + gamma_t_m.T.dot(np.mean(m1, axis=0)) # to do mean(m1) pour avoir un vecteur de taille dim_m + # to do mean(m1) pour avoir un vecteur de taille dim_m + theta_1 = gamma_t + gamma_t_m.T.dot(np.mean(m1, axis=0)) theta_0 = gamma_t + gamma_t_m.T.dot(np.mean(m0, axis=0)) - delta_1 = (gamma_t * t1 + m1.dot(gamma_m) + m1.dot(gamma_t_m) * t1 - (gamma_t * t1 + m0.dot(gamma_m) + m0.dot(gamma_t_m) * t1)).mean() - delta_0 = (gamma_t * t0 + m1.dot(gamma_m) + m1.dot(gamma_t_m) * t0 - (gamma_t * t0 + m0.dot(gamma_m) + m0.dot(gamma_t_m) * t0)).mean() + delta_1 = (gamma_t * t1 + m1.dot(gamma_m) + m1.dot(gamma_t_m) * t1 - + (gamma_t * t1 + m0.dot(gamma_m) + m0.dot(gamma_t_m) * t1)).mean() + delta_0 = (gamma_t * t0 + m1.dot(gamma_m) + m1.dot(gamma_t_m) * t0 - + (gamma_t * t0 + m0.dot(gamma_m) + m0.dot(gamma_t_m) * t0)).mean() if type_m == 'binary': pre_pm = np.hstack((p_m0.reshape(-1, 1), p_m1.reshape(-1, 1))) - pre_pm[m.ravel()==0, :] = 1 - pre_pm[m.ravel()==0, :] + pre_pm[m.ravel() == 0, :] = 1 - pre_pm[m.ravel() == 0, :] pm = pre_pm[:, 1].reshape(-1, 1) else: - p_m0 = np.prod(stats.norm.pdf((m - x.dot(beta_x)) - t0.dot(beta_t) - t0 * (x.dot(beta_xt)) / sigma_m), axis=1) - p_m1 = np.prod(stats.norm.pdf((m - x.dot(beta_x)) - t1.dot(beta_t) - t1 * (x.dot(beta_xt)) / sigma_m), axis=1) + p_m0 = np.prod(stats.norm.pdf((m - x.dot(beta_x)) - + t0.dot(beta_t) - t0 * (x.dot(beta_xt)) / 
sigma_m), axis=1) + p_m1 = np.prod(stats.norm.pdf((m - x.dot(beta_x)) - + t1.dot(beta_t) - t1 * (x.dot(beta_xt)) / sigma_m), axis=1) pre_pm = np.hstack((p_m0.reshape(-1, 1), p_m1.reshape(-1, 1))) pm = pre_pm[:, 1].reshape(-1, 1) - px = np.prod(stats.norm.pdf(x), axis=1) pre_pt = np.hstack(((1-p_t).reshape(-1, 1), p_t.reshape(-1, 1))) @@ -182,15 +185,15 @@ def simulate_data(n, denom = np.sum(pre_pm * pre_pt * double_px, axis=1) num = pm.ravel() * p_t.ravel() * px.ravel() th_p_t_mx = num.ravel() / denom - - return (x, - t, - m, - y, + + return (x, + t, + m, + y, theta_1.flatten()[0] + delta_0.flatten()[0], - theta_1.flatten()[0], - theta_0.flatten()[0], + theta_1.flatten()[0], + theta_0.flatten()[0], delta_1.flatten()[0], - delta_0.flatten()[0], - p_t, - th_p_t_mx) \ No newline at end of file + delta_0.flatten()[0], + p_t, + th_p_t_mx) diff --git a/src/med_bench/mediation.py b/src/med_bench/mediation.py index 9175ea5..6af79ab 100644 --- a/src/med_bench/mediation.py +++ b/src/med_bench/mediation.py @@ -8,19 +8,9 @@ import numpy as np import pandas as pd -import rpy2.robjects as robjects -import rpy2.robjects.packages as rpackages -from numpy.random import default_rng -from rpy2.robjects import numpy2ri, pandas2ri -from scipy import stats -from scipy.special import expit -from scipy.stats import bernoulli from sklearn.base import clone -from sklearn.calibration import CalibratedClassifierCV -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LassoCV, LogisticRegressionCV, RidgeCV -from sklearn.model_selection import KFold -from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import RidgeCV + from .utils.nuisances import (_estimate_conditional_mean_outcome, _estimate_cross_conditional_mean_outcome, @@ -28,17 +18,7 @@ _estimate_mediator_density, _estimate_treatment_probabilities, _get_classifier, _get_regressor) -from .utils.utils import _convert_array_to_R - -pandas2ri.activate() -numpy2ri.activate() - -causalweight = rpackages.importr('causalweight') -mediation = rpackages.importr('mediation') -Rstats = rpackages.importr('stats') -base = rpackages.importr('base') -grf = rpackages.importr('grf') -plmed = rpackages.importr('plmed') +from .utils.utils import r_dependency_required, _check_input ALPHAS = np.logspace(-5, 5, 8) CV_FOLDS = 5 @@ -110,6 +90,9 @@ def mediation_IPW(y, t, m, x, trim, regularization=True, forest=False, int number of used observations (non trimmed) """ + # check input + y, t, m, x = _check_input(y, t, m, x, setting='multidimensional') + # estimate propensities classifier_t_x = _get_classifier(regularization, forest, calibration) classifier_t_xm = _get_classifier(regularization, forest, calibration) @@ -136,12 +119,12 @@ def mediation_IPW(y, t, m, x, trim, regularization=True, forest=False, y0m1 = np.sum(y * (1 - t) * p_xm / ((1 - p_xm) * p_x)) /\ np.sum((1 - t) * p_xm / ((1 - p_xm) * p_x)) - return(y1m1 - y0m0, - y1m1 - y0m1, - y1m0 - y0m0, - y1m1 - y1m0, - y0m1 - y0m0, - np.sum(ind)) + return (y1m1 - y0m0, + y1m1 - y0m1, + y1m0 - y0m0, + y1m1 - y1m0, + y0m1 - y0m0, + np.sum(ind)) def mediation_coefficient_product(y, t, m, x, interaction=False, @@ -199,12 +182,13 @@ def mediation_coefficient_product(y, t, m, x, interaction=False, alphas = ALPHAS else: alphas = [TINY] - if len(x.shape) == 1: - x = x.reshape(-1, 1) - if len(m.shape) == 1: - m = m.reshape(-1, 1) + + # check input + y, t, m, x = _check_input(y, t, m, x, setting='multidimensional') + if len(t.shape) == 1: t = t.reshape(-1, 1) + 
coef_t_m = np.zeros(m.shape[1]) for i in range(m.shape[1]): m_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS)\ @@ -216,12 +200,12 @@ def mediation_coefficient_product(y, t, m, x, interaction=False, # return total, direct and indirect effect direct_effect = y_reg.coef_[x.shape[1]] indirect_effect = sum(y_reg.coef_[x.shape[1] + 1:] * coef_t_m) - return [direct_effect + indirect_effect, + return (direct_effect + indirect_effect, direct_effect, direct_effect, indirect_effect, indirect_effect, - None] + None) def mediation_g_formula(y, t, m, x, interaction=False, forest=False, @@ -268,6 +252,9 @@ def mediation_g_formula(y, t, m, x, interaction=False, forest=False, calibration : str, default=sigmoid calibration mode; for example using a sigmoid function """ + # check input + y, t, m, x = _check_input(y, t, m, x, setting='binary') + # estimate mediator densities classifier_m = _get_classifier(regularization, forest, calibration) f_00x, f_01x, f_10x, f_11x, _, _ = _estimate_mediator_density(t, m, x, y, @@ -297,12 +284,12 @@ def mediation_g_formula(y, t, m, x, interaction=False, forest=False, + indirect_effect_i0 * mu_00x).sum() / n total_effect = direct_effect_control + indirect_effect_treated - return [total_effect, + return (total_effect, direct_effect_treated, direct_effect_control, indirect_effect_treated, indirect_effect_control, - None] + None) def alternative_estimator(y, t, m, x, regularization=True): @@ -339,10 +326,10 @@ def alternative_estimator(y, t, m, x, regularization=True): alphas = ALPHAS else: alphas = [TINY] - if len(x.shape) == 1: - x = x.reshape(-1, 1) - if len(m.shape) == 1: - m = m.reshape(-1, 1) + + # check input + y, t, m, x = _check_input(y, t, m, x, setting='multidimensional') + m = m.ravel() treated = (t == 1) # computation of direct effect @@ -365,12 +352,12 @@ def alternative_estimator(y, t, m, x, regularization=True): # computation of indirect effect indirect_effect = total_effect - direct_effect - return [total_effect, + return (total_effect, direct_effect, direct_effect, indirect_effect, indirect_effect, - None] + None) def mediation_multiply_robust(y, t, m, x, interaction=False, forest=False, @@ -453,29 +440,9 @@ def mediation_multiply_robust(y, t, m, x, interaction=False, forest=False, - If x, t, m, or y don't have the same length. - If m is not binary. 
""" - # Format checking - if len(y) != len(y.ravel()): - raise ValueError("Multidimensional y is not supported") - if len(t) != len(t.ravel()): - raise ValueError("Multidimensional t is not supported") - if len(m) != len(m.ravel()): - raise ValueError("Multidimensional m is not supported") - - n = len(y) - if len(x.shape) == 1: - x.reshape(n, 1) - if len(m.shape) == 1: - m.reshape(n, 1) - - dim_m = m.shape[1] - if n * dim_m != sum(m.ravel() == 1) + sum(m.ravel() == 0): - raise ValueError("m is not binary") - - y = y.ravel() - t = t.ravel() + # check input + y, t, m, x = _check_input(y, t, m, x, setting='binary') m = m.ravel() - if n != len(x) or n != len(m) or n != len(t): - raise ValueError("Inputs don't have the same number of observations") # estimate propensities classifier_t_x = _get_classifier(regularization, forest, calibration) @@ -523,28 +490,28 @@ def mediation_multiply_robust(y, t, m, x, interaction=False, forest=False, y0m0 = (((1 - t) / (1 - p_x) * (y - E_mu_t0_t0)) / sum_score_m0 + E_mu_t0_t0) y1m0 = ( - ((t / p_x) * (f_m0x / f_m1x) * (y - mu_1mx)) / sum_score_t1m0 - + ((1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0)) / sum_score_m0 - + E_mu_t1_t0 + ((t / p_x) * (f_m0x / f_m1x) * (y - mu_1mx)) / sum_score_t1m0 + + ((1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0)) / sum_score_m0 + + E_mu_t1_t0 ) y0m1 = ( - ((1 - t) / (1 - p_x) * (f_m1x / f_m0x) * (y - mu_0mx)) - / sum_score_t0m1 + t / p_x * ( - mu_0mx - E_mu_t0_t1) / sum_score_m1 - + E_mu_t0_t1 + ((1 - t) / (1 - p_x) * (f_m1x / f_m0x) * (y - mu_0mx)) + / sum_score_t0m1 + t / p_x * ( + mu_0mx - E_mu_t0_t1) / sum_score_m1 + + E_mu_t0_t1 ) else: y1m1 = t / p_x * (y - E_mu_t1_t1) + E_mu_t1_t1 y0m0 = (1 - t) / (1 - p_x) * (y - E_mu_t0_t0) + E_mu_t0_t0 y1m0 = ( - (t / p_x) * (f_m0x / f_m1x) * (y - mu_1mx) - + (1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0) - + E_mu_t1_t0 + (t / p_x) * (f_m0x / f_m1x) * (y - mu_1mx) + + (1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0) + + E_mu_t1_t0 ) y0m1 = ( - (1 - t) / (1 - p_x) * (f_m1x / f_m0x) * (y - mu_0mx) - + t / p_x * (mu_0mx - E_mu_t0_t1) - + E_mu_t0_t1 + (1 - t) / (1 - p_x) * (f_m1x / f_m0x) * (y - mu_0mx) + + t / p_x * (mu_0mx - E_mu_t0_t1) + + E_mu_t0_t1 ) # effects computing @@ -557,6 +524,7 @@ def mediation_multiply_robust(y, t, m, x, interaction=False, forest=False, return total, direct1, direct0, indirect1, indirect0, n_discarded +@r_dependency_required(['mediation', 'stats', 'base']) def r_mediate(y, t, m, x, interaction=False): """ This function calls the R function mediate from the package mediation @@ -581,7 +549,22 @@ def r_mediate(y, t, m, x, interaction=False): whether to include interaction terms in the model interactions are terms XT, TM, MX """ + + import rpy2.robjects as robjects + import rpy2.robjects.packages as rpackages + from rpy2.robjects import numpy2ri, pandas2ri + + pandas2ri.activate() + numpy2ri.activate() + + mediation = rpackages.importr('mediation') + Rstats = rpackages.importr('stats') + base = rpackages.importr('base') + + # check input + y, t, m, x = _check_input(y, t, m, x, setting='binary') m = m.ravel() + var_names = [[y, 'y'], [t, 't'], [m, 'm'], @@ -619,12 +602,27 @@ def r_mediate(y, t, m, x, interaction=False): return to_return + [None] +@r_dependency_required(['plmed', 'base']) def r_mediation_g_estimator(y, t, m, x): """ This function calls the R G-estimator from the package plmed (https://github.com/ohines/plmed) """ + + import rpy2.robjects as robjects + import rpy2.robjects.packages as rpackages + from rpy2.robjects import numpy2ri, pandas2ri + + 
pandas2ri.activate() + numpy2ri.activate() + + plmed = rpackages.importr('plmed') + base = rpackages.importr('base') + + # check input + y, t, m, x = _check_input(y, t, m, x, setting='binary') m = m.ravel() + var_names = [[y, 'y'], [t, 't'], [m, 'm'], @@ -653,14 +651,15 @@ def r_mediation_g_estimator(y, t, m, x): data=base.as_symbol('df')) direct_effect = res.rx2('coef')[0] indirect_effect = res.rx2('coef')[1] - return [direct_effect + indirect_effect, + return (direct_effect + indirect_effect, direct_effect, direct_effect, indirect_effect, indirect_effect, - None] + None) +@r_dependency_required(['causalweight', 'base']) def r_mediation_DML(y, t, m, x, trim=0.05, order=1): """ This function calls the R Double Machine Learning estimator from the @@ -696,6 +695,20 @@ def r_mediation_DML(y, t, m, x, trim=0.05, order=1): Polynomials/interactions are created using the Generate. Powers command of the LARF package. """ + + import rpy2.robjects.packages as rpackages + from rpy2.robjects import numpy2ri, pandas2ri + from .utils.utils import _convert_array_to_R + + pandas2ri.activate() + numpy2ri.activate() + + causalweight = rpackages.importr('causalweight') + base = rpackages.importr('base') + + # check input + y, t, m, x = _check_input(y, t, m, x, setting='multidimensional') + x_r, t_r, m_r, y_r = [base.as_matrix(_convert_array_to_R(uu)) for uu in (x, t, m, y)] res = causalweight.medDML(y_r, t_r, m_r, x_r, trim=trim, order=order) @@ -785,29 +798,12 @@ def mediation_DML(y, t, m, x, forest=False, crossfit=0, trim=0.05, - If t or y are multidimensional. - If x, t, m, or y don't have the same length. """ - # check format - if len(y) != len(y.ravel()): - raise ValueError("Multidimensional y is not supported") - - if len(t) != len(t.ravel()): - raise ValueError("Multidimensional t is not supported") - + # check input + y, t, m, x = _check_input(y, t, m, x, setting='multidimensional') n = len(y) - t = t.ravel() - y = y.ravel() - - if n != len(x) or n != len(m) or n != len(t): - raise ValueError("Inputs don't have the same number of observations") - - if len(x.shape) == 1: - x.reshape(n, 1) - - if len(m.shape) == 1: - m.reshape(n, 1) nobs = 0 - var_name = [ "p_x", "p_xm", @@ -892,4 +888,4 @@ def mediation_DML(y, t, m, x, forest=False, crossfit=0, trim=0.05, direct0 = my1m0 - my0m0 indirect1 = my1m1 - my1m0 indirect0 = my0m1 - my0m0 - return total, direct1, direct0, indirect1, indirect0, n - nobs \ No newline at end of file + return total, direct1, direct0, indirect1, indirect0, n - nobs diff --git a/src/med_bench/utils/constants.py b/src/med_bench/utils/constants.py new file mode 100644 index 0000000..4f457c6 --- /dev/null +++ b/src/med_bench/utils/constants.py @@ -0,0 +1,154 @@ +import itertools +import numpy as np +from numpy.random import default_rng + +# CONSTANTS USED FOR TESTS + +# TOLERANCE THRESHOLDS + +TOLERANCE_THRESHOLDS = { + "SMALL": { + "ATE": 0.05, + "DIRECT": 0.05, + "INDIRECT": 0.2, + }, + "MEDIUM": { + "ATE": 0.10, + "DIRECT": 0.10, + "INDIRECT": 0.4, + }, + "LARGE": { + "ATE": 0.15, + "DIRECT": 0.15, + "INDIRECT": 0.8, + }, + "INFINITE": { + "ATE": np.inf, + "DIRECT": np.inf, + "INDIRECT": np.inf, + }, +} + + +def get_tolerance_array(tolerance_size: str) -> np.array: + """Get tolerance array for tolerance tests + + Parameters + ---------- + tolerance_size : str + tolerance size, can be "SMALL", "MEDIUM", "LARGE" or "INFINITE" + + Returns + ------- + np.array + array of size 5 containing the ATE, DIRECT (*2) and INDIRECT (*2) effects tolerance + """ + + return np.array( + [ + 
TOLERANCE_THRESHOLDS[tolerance_size]["ATE"], + TOLERANCE_THRESHOLDS[tolerance_size]["DIRECT"], + TOLERANCE_THRESHOLDS[tolerance_size]["DIRECT"], + TOLERANCE_THRESHOLDS[tolerance_size]["INDIRECT"], + TOLERANCE_THRESHOLDS[tolerance_size]["INDIRECT"], + ] + ) + + +SMALL_TOLERANCE = get_tolerance_array("SMALL") +MEDIUM_TOLERANCE = get_tolerance_array("MEDIUM") +LARGE_TOLERANCE = get_tolerance_array("LARGE") +INFINITE_TOLERANCE = get_tolerance_array("INFINITE") + +TOLERANCE_DICT = { + "coefficient_product": LARGE_TOLERANCE, + "mediation_ipw_noreg": INFINITE_TOLERANCE, + "mediation_ipw_reg": INFINITE_TOLERANCE, + "mediation_ipw_reg_calibration": INFINITE_TOLERANCE, + "mediation_ipw_forest": INFINITE_TOLERANCE, + "mediation_ipw_forest_calibration": INFINITE_TOLERANCE, + "mediation_g_computation_noreg": LARGE_TOLERANCE, + "mediation_g_computation_reg": MEDIUM_TOLERANCE, + "mediation_g_computation_reg_calibration": LARGE_TOLERANCE, + "mediation_g_computation_forest": LARGE_TOLERANCE, + "mediation_g_computation_forest_calibration": INFINITE_TOLERANCE, + "mediation_multiply_robust_noreg": INFINITE_TOLERANCE, + "mediation_multiply_robust_reg": LARGE_TOLERANCE, + "mediation_multiply_robust_reg_calibration": LARGE_TOLERANCE, + "mediation_multiply_robust_forest": INFINITE_TOLERANCE, + "mediation_multiply_robust_forest_calibration": LARGE_TOLERANCE, + "simulation_based": LARGE_TOLERANCE, + "mediation_DML": INFINITE_TOLERANCE, + "mediation_DML_reg_fixed_seed": INFINITE_TOLERANCE, + "mediation_g_estimator": SMALL_TOLERANCE, + "mediation_ipw_noreg_cf": INFINITE_TOLERANCE, + "mediation_ipw_reg_cf": INFINITE_TOLERANCE, + "mediation_ipw_reg_calibration_cf": INFINITE_TOLERANCE, + "mediation_ipw_forest_cf": INFINITE_TOLERANCE, + "mediation_ipw_forest_calibration_cf": INFINITE_TOLERANCE, + "mediation_g_computation_noreg_cf": SMALL_TOLERANCE, + "mediation_g_computation_reg_cf": LARGE_TOLERANCE, + "mediation_g_computation_reg_calibration_cf": LARGE_TOLERANCE, + "mediation_g_computation_forest_cf": INFINITE_TOLERANCE, + "mediation_g_computation_forest_calibration_cf": LARGE_TOLERANCE, + "mediation_multiply_robust_noreg_cf": MEDIUM_TOLERANCE, + "mediation_multiply_robust_reg_cf": LARGE_TOLERANCE, + "mediation_multiply_robust_reg_calibration_cf": MEDIUM_TOLERANCE, + "mediation_multiply_robust_forest_cf": INFINITE_TOLERANCE, + "mediation_multiply_robust_forest_calibration_cf": INFINITE_TOLERANCE, +} + +ESTIMATORS = list(TOLERANCE_DICT.keys()) + +# PARAMETERS VALUES FOR DATA GENERATION + +PARAMETER_NAME = [ + "n", + "rg", + "mis_spec_m", + "mis_spec_y", + "dim_x", + "dim_m", + "seed", + "type_m", + "sigma_y", + "sigma_m", + "beta_t_factor", + "beta_m_factor", +] + +PARAMETER_LIST = list( + itertools.product( + [1000], + [default_rng(321)], + [False], + [False], + [1, 5], + [1], + [123], + ["binary"], + [0.5], + [0.5], + [0.5], + [0.5], + ) +) + +PARAMETER_LIST.extend( + list( + itertools.product( + [1000], + [default_rng(321)], + [False], + [False], + [1, 5], + [1, 5], + [123], + ["continuous"], + [0.5], + [0.5], + [0.5], + [0.5], + ) + ) +) diff --git a/src/med_bench/utils/nuisances.py b/src/med_bench/utils/nuisances.py index fd23c90..ded68f0 100644 --- a/src/med_bench/utils/nuisances.py +++ b/src/med_bench/utils/nuisances.py @@ -3,19 +3,17 @@ used in mediation estimators in causal inference """ import numpy as np -import pandas as pd -from numpy.random import default_rng -from scipy import stats -from scipy.special import expit -from scipy.stats import bernoulli from sklearn.base import clone from sklearn.calibration 
import CalibratedClassifierCV from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LassoCV, LogisticRegressionCV, RidgeCV +from sklearn.linear_model import LogisticRegressionCV, RidgeCV from sklearn.model_selection import KFold -from sklearn.preprocessing import PolynomialFeatures -from .utils import _convert_array_to_R, _get_interactions +from .utils import check_r_dependencies, _get_interactions + +if check_r_dependencies(): + from .utils import _convert_array_to_R + ALPHAS = np.logspace(-5, 5, 8) CV_FOLDS = 5 @@ -99,7 +97,8 @@ def _get_regressor(regularization, forest, random_state=42): if not forest: reg = RidgeCV(alphas=alphas, cv=CV_FOLDS) else: - reg = RandomForestRegressor(n_estimators=100, min_samples_leaf=10) + reg = RandomForestRegressor( + n_estimators=100, min_samples_leaf=10, random_state=random_state) return reg @@ -191,7 +190,6 @@ def _estimate_mediator_density(t, m, x, y, crossfit, clf_m, interaction): # f_mtx model fitting clf_m = clf_m.fit(t_x[train_index, :], m[train_index]) - #clf_m = clf_m.fit(t_x[train_index, :], m.ravel()[train_index]) # predict f(M=m|T=t,X) fm_0 = clf_m.predict_proba(t0_x[test_index, :]) @@ -364,12 +362,12 @@ def _estimate_cross_conditional_mean_outcome(t, m, x, y, crossfit, reg_y, # predict E[E[Y|T=1,M=m,X]|T=t,X] E_mu_t1_t0[test_index] = ( - reg_y_t1m0_t0.predict(x[test_index, :]) * f_00x[test_index] - + reg_y_t1m1_t0.predict(x[test_index, :]) * f_01x[test_index] + reg_y_t1m0_t0.predict(x[test_index, :]) * f_00x[test_index] + + reg_y_t1m1_t0.predict(x[test_index, :]) * f_01x[test_index] ) E_mu_t1_t1[test_index] = ( - reg_y_t1m0_t1.predict(x[test_index, :]) * f_10x[test_index] - + reg_y_t1m1_t1.predict(x[test_index, :]) * f_11x[test_index] + reg_y_t1m0_t1.predict(x[test_index, :]) * f_10x[test_index] + + reg_y_t1m1_t1.predict(x[test_index, :]) * f_11x[test_index] ) # E[E[Y|T=0,M=m,X]|T=t,X] model fitting @@ -388,12 +386,12 @@ def _estimate_cross_conditional_mean_outcome(t, m, x, y, crossfit, reg_y, # predict E[E[Y|T=0,M=m,X]|T=t,X] E_mu_t0_t0[test_index] = ( - reg_y_t0m0_t0.predict(x[test_index, :]) * f_00x[test_index] - + reg_y_t0m1_t0.predict(x[test_index, :]) * f_01x[test_index] + reg_y_t0m0_t0.predict(x[test_index, :]) * f_00x[test_index] + + reg_y_t0m1_t0.predict(x[test_index, :]) * f_01x[test_index] ) E_mu_t0_t1[test_index] = ( - reg_y_t0m0_t1.predict(x[test_index, :]) * f_10x[test_index] - + reg_y_t0m1_t1.predict(x[test_index, :]) * f_11x[test_index] + reg_y_t0m0_t1.predict(x[test_index, :]) * f_10x[test_index] + + reg_y_t0m1_t1.predict(x[test_index, :]) * f_11x[test_index] ) return mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 diff --git a/src/med_bench/utils/utils.py b/src/med_bench/utils/utils.py index 23ea3e7..6b487a9 100644 --- a/src/med_bench/utils/utils.py +++ b/src/med_bench/utils/utils.py @@ -1,8 +1,92 @@ import numpy as np - import rpy2.robjects as robjects -import rpy2.robjects.packages as rpackages -from rpy2.robjects import pandas2ri, numpy2ri +from sklearn.utils.validation import check_array + +import subprocess +import warnings + + +def check_r_dependencies(): + try: + # Check if R is accessible by trying to get its version + subprocess.check_output(["R", "--version"]) + + # If the above command fails, it will raise a subprocess.CalledProcessError and won't reach here + + # Assuming reaching here means R is accessible, now try importing rpy2 packages + import rpy2.robjects.packages as rpackages + required_packages = [ + 'causalweight', 'mediation', 
'stats', 'base', 'grf', 'plmed' + ] + + for package in required_packages: + rpackages.importr(package) + + return True # All checks passed, R and required packages are available + + except: + # Handle the case where R is not found or rpy2 is not installed + return False + + +def is_r_installed(): + try: + subprocess.check_output(["R", "--version"]) + return True + except: + return False + + +def check_r_package(package_name): + try: + import rpy2.robjects.packages as rpackages + rpackages.importr(package_name) + return True + except: + return False + + +def r_dependency_required(required_packages): + def decorator(func): + def wrapper(*args, **kwargs): + if not is_r_installed(): + warnings.warn( + "R is not installed or not found. " + "Please install R and set it up correctly in your system." + ) + return None + + for package in required_packages: + if not check_r_package(package): + if package != 'plmed': + warnings.warn( + f"The '{package}' R package is not installed. " + "Please install it using R by running:\n" + "import rpy2.robjects.packages as rpackages\n" + "utils = rpackages.importr('utils')\n" + "utils.chooseCRANmirror(ind=33)\n" + f"utils.install_packages('{package}')" + ) + else: + warnings.warn( + "The 'plmed' R package is not installed. " + "Please install it using R by running:\n" + "import rpy2.robjects.packages as rpackages\n" + "utils = rpackages.importr('utils')\n" + "utils.chooseCRANmirror(ind=33)\n" + "utils.install_packages('devtools')\n" + "devtools = rpackages.importr('devtools')\n" + "devtools.install_github('ohines/plmed')" + ) + return None + return func(*args, **kwargs) + return wrapper + return decorator + + +if is_r_installed(): + import rpy2.robjects as robjects + def _get_interactions(interaction, *args): """ @@ -41,7 +125,7 @@ def _get_interactions(interaction, *args): variables = list(args) for index, var in enumerate(variables): if len(var.shape) == 1: - variables[index] = var.reshape(-1,1) + variables[index] = var.reshape(-1, 1) pre_inter_variables = np.hstack(variables) if not interaction: return pre_inter_variables @@ -55,6 +139,7 @@ def _get_interactions(interaction, *args): result = np.hstack((pre_inter_variables, new_vars)) return result + def _convert_array_to_R(x): """ converts a numpy array to a R matrix or vector @@ -67,3 +152,76 @@ def _convert_array_to_R(x): elif len(x.shape) == 2: return robjects.r.matrix(robjects.FloatVector(x.ravel()), nrow=x.shape[0], byrow='TRUE') + + +def _check_input(y, t, m, x, setting): + """ + internal function to check inputs + Parameters + ---------- + y : array-like, shape (n_samples) + Outcome value for each unit, continuous + + t : array-like, shape (n_samples) + Treatment value for each unit, binary + + m : array-like, shape (n_samples, n_mediators) + Mediator value for each unit, binary and unidimensional + + x : array-like, shape (n_samples, n_features_covariates) + Covariates value for each unit, continuous + + setting : string + ('binary', 'continuous', 'multidimensional') value for the mediator + + Returns + ------- + y_converted : array-like, shape (n_samples,) + Outcome value for each unit, continuous + + t_converted : array-like, shape (n_samples,) + Treatment value for each unit, binary + + m_converted : array-like, shape (n_samples, n_mediators) + Mediator value for each unit, binary and unidimensional + + x_converted : array-like, shape (n_samples, n_features_covariates) + Covariates value for each unit, continuous + """ + # check format + if len(y) != len(y.ravel()): + raise 
ValueError("Multidimensional y (outcome) is not supported") + + if len(t) != len(t.ravel()): + raise ValueError("Multidimensional t (exposure) is not supported") + + if len(np.unique(t)) != 2: + raise ValueError("Only a binary t (exposure) is supported") + + n = len(y) + t_converted = t.ravel() + y_converted = y.ravel() + + if n != len(x) or n != len(m) or n != len(t): + raise ValueError("Inputs don't have the same number of observations") + + if len(x.shape) == 1: + x_converted = x.reshape(n, 1) + else: + x_converted = x + + if len(m.shape) == 1: + m_converted = m.reshape(n, 1) + else: + m_converted = m + + if (m_converted.shape[1] >1) and (setting != 'multidimensional'): + raise ValueError("Multidimensional m (mediator) is not supported") + + if (setting == 'binary') and (len(np.unique(m)) != 2): + raise ValueError( + "Only a binary one-dimensional m (mediator) is supported") + + return y_converted, t_converted, m_converted, x_converted + + diff --git a/src/tests/estimation/generate_tests_results.py b/src/tests/estimation/generate_tests_results.py new file mode 100644 index 0000000..b698a05 --- /dev/null +++ b/src/tests/estimation/generate_tests_results.py @@ -0,0 +1,63 @@ +import numpy as np + +from med_bench.get_simulated_data import simulate_data +from med_bench.get_estimation import get_estimation + +from med_bench.utils.constants import ESTIMATORS, PARAMETER_LIST, PARAMETER_NAME + + +def _get_data_from_list(data): + """Get x, t, m, y from simulated data + """ + x = data[0] + t = data[1].ravel() + m = data[2] + y = data[3].ravel() + + return x, t, m, y + + +def _get_config_from_dict(dict_params): + """Get config parameter used for estimators parametrisation + """ + if dict_params["dim_m"] == 1 and dict_params["type_m"] == "binary": + config = 0 + else: + config = 5 + return config + + +def _get_estimators_results(x, t, m, y, config, estimator): + """Get estimation result from specified input parameters and estimator name + """ + + try: + res = get_estimation(x, t, m, y, estimator, config)[0:5] + return res + + except Exception as e: + print(f"{e}") + return str(e) + + +if __name__ == "__main__": + + results = [] + + for param_list in PARAMETER_LIST: + + # Get synthetic input data from parameters list defined above + dict_params = dict(zip(PARAMETER_NAME, param_list)) + data = simulate_data(**dict_params) + x, t, m, y = _get_data_from_list(data) + config = _get_config_from_dict(dict_params=dict_params) + + for estimator in ESTIMATORS: + + # Get results from synthetic inputs + result = _get_estimators_results(x, t, m, y, config, estimator) + row = [estimator, x, t, m, y, config, result] + results.append(row) + + # Store the results in a npy file + np.save("tests_results.npy", np.array(results, dtype="object")) diff --git a/src/tests/estimation/test_exact_estimation.py b/src/tests/estimation/test_exact_estimation.py new file mode 100644 index 0000000..bf36b5f --- /dev/null +++ b/src/tests/estimation/test_exact_estimation.py @@ -0,0 +1,102 @@ +""" +Pytest file for get_estimation.py + +It tests all the benchmark_mediation estimators : +- for a certain tolerance +- whether their effects satisfy "total = direct + indirect" +- whether they support (n,1) and (n,) inputs + +To be robust to future updates, tests are adjusted with a smaller tolerance when possible. +The test is skipped if estimator has not been implemented yet, i.e. if ValueError is raised. +The test fails for any other unwanted behavior. 
+""" + +from pprint import pprint +import pytest +import os +import numpy as np + +from med_bench.get_estimation import get_estimation + +current_dir = os.path.dirname(__file__) +true_estimations_file_path = os.path.join(current_dir, 'tests_results.npy') +TRUE_ESTIMATIONS = np.load(true_estimations_file_path, allow_pickle=True) + + +@pytest.fixture(params=range(TRUE_ESTIMATIONS.shape[0])) +def tests_results_idx(request): + return request.param + + +@pytest.fixture +def data(tests_results_idx): + return TRUE_ESTIMATIONS[tests_results_idx] + + +@pytest.fixture +def estimator(data): + return data[0] + + +@pytest.fixture +def x(data): + return data[1] + + +# t is raveled because some estimators fail with (n,1) inputs +@pytest.fixture +def t(data): + return data[2] + + +@pytest.fixture +def m(data): + return data[3] + + +@pytest.fixture +def y(data): + return data[4] + + +@pytest.fixture +def config(data): + return data[5] + + +@pytest.fixture +def result(data): + return data[6] + + +@pytest.fixture +def effects_chap(x, t, m, y, estimator, config): + # try whether estimator is implemented or not + try: + res = get_estimation(x, t, m, y, estimator, config)[0:5] + except Exception as e: + if str(e) in ( + "Estimator only supports 1D binary mediator.", + "Estimator does not support 1D binary mediator.", + ): + pytest.skip(f"{e}") + + # We skip the test if an error with function from glmet rpy2 package occurs + elif "glmnet::glmnet" in str(e): + pytest.skip(f"{e}") + + else: + pytest.fail(f"{e}") + + # NaN situations + if np.all(np.isnan(res)): + pytest.xfail("all effects are NaN") + elif np.any(np.isnan(res)): + pprint("NaN found") + + return res + + +def test_estimation_exactness(result, effects_chap): + + assert np.all(effects_chap == pytest.approx(result)) diff --git a/src/tests/estimation/test_get_estimation.py b/src/tests/estimation/test_get_estimation.py index 395a04e..b50aaea 100644 --- a/src/tests/estimation/test_get_estimation.py +++ b/src/tests/estimation/test_get_estimation.py @@ -12,157 +12,14 @@ """ from pprint import pprint -import itertools import pytest import numpy as np -from numpy.random import default_rng + from med_bench.get_simulated_data import simulate_data from med_bench.get_estimation import get_estimation - -SMALL_ATE_TOLERANCE = 0.05 -SMALL_DIRECT_TOLERANCE = 0.05 -SMALL_INDIRECT_TOLERANCE = 0.2 - -MEDIUM_ATE_TOLERANCE = 0.10 -MEDIUM_DIRECT_TOLERANCE = 0.10 -MEDIUM_INDIRECT_TOLERANCE = 0.4 - -LARGE_ATE_TOLERANCE = 0.15 -LARGE_DIRECT_TOLERANCE = 0.15 -LARGE_INDIRECT_TOLERANCE = 0.8 -# indirect effect is weak, leading to a large relative error - -SMALL_TOLERANCE = np.array( - [ - SMALL_ATE_TOLERANCE, - SMALL_DIRECT_TOLERANCE, - SMALL_DIRECT_TOLERANCE, - SMALL_INDIRECT_TOLERANCE, - SMALL_INDIRECT_TOLERANCE, - ] -) - -MEDIUM_TOLERANCE = np.array( - [ - MEDIUM_ATE_TOLERANCE, - MEDIUM_DIRECT_TOLERANCE, - MEDIUM_DIRECT_TOLERANCE, - MEDIUM_INDIRECT_TOLERANCE, - MEDIUM_INDIRECT_TOLERANCE, - ] -) - -LARGE_TOLERANCE = np.array( - [ - LARGE_ATE_TOLERANCE, - LARGE_DIRECT_TOLERANCE, - LARGE_DIRECT_TOLERANCE, - LARGE_INDIRECT_TOLERANCE, - LARGE_INDIRECT_TOLERANCE, - ] -) - -INFINITE_TOLERANCE = np.array( - [ - np.inf, - np.inf, - np.inf, - np.inf, - np.inf, - ] -) - - -TOLERANCE_DICT = { - "coefficient_product": LARGE_TOLERANCE, - "mediation_ipw_noreg": INFINITE_TOLERANCE, - "mediation_ipw_reg": INFINITE_TOLERANCE, - "mediation_ipw_reg_calibration": INFINITE_TOLERANCE, - "mediation_ipw_forest": INFINITE_TOLERANCE, - "mediation_ipw_forest_calibration": INFINITE_TOLERANCE, - 
"mediation_g_computation_noreg": LARGE_TOLERANCE, - "mediation_g_computation_reg": MEDIUM_TOLERANCE, - "mediation_g_computation_reg_calibration": LARGE_TOLERANCE, - "mediation_g_computation_forest": LARGE_TOLERANCE, - "mediation_g_computation_forest_calibration": INFINITE_TOLERANCE, - "mediation_multiply_robust_noreg": INFINITE_TOLERANCE, - "mediation_multiply_robust_reg": LARGE_TOLERANCE, - "mediation_multiply_robust_reg_calibration": LARGE_TOLERANCE, - "mediation_multiply_robust_forest": INFINITE_TOLERANCE, - "mediation_multiply_robust_forest_calibration": LARGE_TOLERANCE, - "simulation_based": LARGE_TOLERANCE, - "mediation_DML": INFINITE_TOLERANCE, - "mediation_DML_reg_fixed_seed": INFINITE_TOLERANCE, - "mediation_g_estimator": SMALL_TOLERANCE, - "mediation_ipw_noreg_cf": INFINITE_TOLERANCE, - "mediation_ipw_reg_cf": INFINITE_TOLERANCE, - "mediation_ipw_reg_calibration_cf": INFINITE_TOLERANCE, - "mediation_ipw_forest_cf": INFINITE_TOLERANCE, - "mediation_ipw_forest_calibration_cf": INFINITE_TOLERANCE, - "mediation_g_computation_noreg_cf": SMALL_TOLERANCE, - "mediation_g_computation_reg_cf": LARGE_TOLERANCE, - "mediation_g_computation_reg_calibration_cf": LARGE_TOLERANCE, - "mediation_g_computation_forest_cf": INFINITE_TOLERANCE, - "mediation_g_computation_forest_calibration_cf": LARGE_TOLERANCE, - "mediation_multiply_robust_noreg_cf": MEDIUM_TOLERANCE, - "mediation_multiply_robust_reg_cf": LARGE_TOLERANCE, - "mediation_multiply_robust_reg_calibration_cf": MEDIUM_TOLERANCE, - "mediation_multiply_robust_forest_cf": INFINITE_TOLERANCE, - "mediation_multiply_robust_forest_calibration_cf": INFINITE_TOLERANCE, -} - - -PARAMETER_NAME = [ - "n", - "rg", - "mis_spec_m", - "mis_spec_y", - "dim_x", - "dim_m", - "seed", - "type_m", - "sigma_y", - "sigma_m", - "beta_t_factor", - "beta_m_factor", -] - -PARAMETER_LIST = list( - itertools.product( - [1000], - [default_rng(321)], - [False], - [False], - [1, 5], - [1], - [123], - ["binary"], - [0.5], - [0.5], - [0.5], - [0.5], - ) -) - -PARAMETER_LIST.extend( - list( - itertools.product( - [1000], - [default_rng(321)], - [False], - [False], - [1, 5], - [1, 5], - [123], - ["continuous"], - [0.5], - [0.5], - [0.5], - [0.5], - ) - ) -) +from med_bench.utils.utils import check_r_dependencies +from med_bench.utils.constants import PARAMETER_LIST, PARAMETER_NAME, TOLERANCE_DICT @pytest.fixture(params=PARAMETER_LIST) @@ -221,16 +78,36 @@ def config(dict_param): @pytest.fixture def effects_chap(x, t, m, y, estimator, config): # try whether estimator is implemented or not + + r_dependent_estimators = [ + "mediation_IPW_R", "simulation_based", "mediation_DML", "mediation_g_estimator" + ] + + if estimator in r_dependent_estimators and not check_r_dependencies(): + warning_message = ( + "R or some required R packages ('causalweight', 'mediation', 'stats', 'base', " + "'grf', 'plmed') not available" + ) + print(warning_message) + pytest.skip( + f"Skipping {estimator} as the required R environment/packages are not available." 
+ ) + try: res = get_estimation(x, t, m, y, estimator, config)[0:5] - except ValueError as message_error: - if message_error.args[0] in ( + except Exception as e: + if str(e) in ( "Estimator only supports 1D binary mediator.", "Estimator does not support 1D binary mediator.", ): - pytest.skip(f"{message_error}") + pytest.skip(f"{e}") + + # We skip the test if an error with function from glmet rpy2 package occurs + elif "glmnet::glmnet" in str(e): + pytest.skip(f"{e}") + else: - pytest.fail(f"{message_error}") + pytest.fail(f"{e}") # NaN situations if np.all(np.isnan(res)): @@ -248,17 +125,20 @@ def test_tolerance(effects, effects_chap, tolerance): def test_total_is_direct_plus_indirect(effects_chap): if not np.isnan(effects_chap[1]): - assert effects_chap[0] == pytest.approx(effects_chap[1] + effects_chap[4]) + assert effects_chap[0] == pytest.approx( + effects_chap[1] + effects_chap[4]) if not np.isnan(effects_chap[2]): - assert effects_chap[0] == pytest.approx(effects_chap[2] + effects_chap[3]) + assert effects_chap[0] == pytest.approx( + effects_chap[2] + effects_chap[3]) -@pytest.mark.xfail +#@pytest.mark.xfail def test_robustness_to_ravel_format(data, estimator, config, effects_chap): if "forest" in estimator: pytest.skip("Forest estimator skipped") assert np.all( - get_estimation(data[0], data[1], data[2], data[3], estimator, config)[0:5] + get_estimation(data[0], data[1], data[2], + data[3], estimator, config)[0:5] == pytest.approx( effects_chap, nan_ok=True ) # effects_chap is obtained with data[1].ravel() and data[3].ravel() diff --git a/src/tests/estimation/tests_results.npy b/src/tests/estimation/tests_results.npy new file mode 100644 index 0000000..274ec88 Binary files /dev/null and b/src/tests/estimation/tests_results.npy differ diff --git a/src/tests/simulate_data/test_get_simulated_data.py b/src/tests/simulate_data/test_get_simulated_data.py index e13f4d8..b7e066f 100644 --- a/src/tests/simulate_data/test_get_simulated_data.py +++ b/src/tests/simulate_data/test_get_simulated_data.py @@ -14,45 +14,11 @@ """ from pprint import pprint -import itertools import pytest import numpy as np from numpy.random import default_rng from med_bench.get_simulated_data import simulate_data - - -PARAMETER_NAME = [ - "n", - "rg", - "mis_spec_m", - "mis_spec_y", - "dim_x", - "dim_m", - "seed", - "type_m", - "sigma_y", - "sigma_m", - "beta_t_factor", - "beta_m_factor", -] - - -PARAMETER_LIST = list( - itertools.product( - [1, 500, 1000], - [default_rng(321)], - [False, True], - [False, True], - [1, 5], - [1], - [123], - ["binary", "continuous"], - [0.5], - [0.5], - [0.5], - [0.5], - ) -) +from med_bench.utils.constants import PARAMETER_LIST, PARAMETER_NAME @pytest.fixture(params=PARAMETER_LIST)