diff --git a/.github/workflows/code-cov.yaml b/.github/workflows/code-cov.yaml index 8182928..551a319 100644 --- a/.github/workflows/code-cov.yaml +++ b/.github/workflows/code-cov.yaml @@ -39,6 +39,8 @@ jobs: dependencies: 'NA' install-pandoc: false packages: | + Matrix@1.6-5 + MASS@7.3-60 grf causalweight mediation @@ -53,6 +55,7 @@ jobs: - name: Run tests with coverage run: | + export LD_LIBRARY_PATH=$(python -m rpy2.situation LD_LIBRARY_PATH):${LD_LIBRARY_PATH} pytest --cov=med_bench --cov-report=xml - name: Upload coverage to Codecov diff --git a/.github/workflows/save-packages-cache.yaml b/.github/workflows/save-packages-cache.yaml deleted file mode 100644 index 9e7f0e0..0000000 --- a/.github/workflows/save-packages-cache.yaml +++ /dev/null @@ -1,50 +0,0 @@ -name: cache-R - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -jobs: - test: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.11' # Specify the Python version you want to use - - - name: Install Package in Editable Mode with Python Dependencies - run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" - - - name: Setup R - uses: r-lib/actions/setup-r@v2 - with: - r-version: '4.3.2' # Use the R version you prefer - - - name: Install R packages - uses: r-lib/actions/setup-r-dependencies@v2 - with: - cache: true - cache-version: 1 - dependencies: 'NA' - install-pandoc: false - packages: | - grf - causalweight - mediation - - - name: Install plmed package - run: | - R -e "pak::pkg_install('ohines/plmed')" - - - name: Install Pytest - run: | - pip install pytest \ No newline at end of file diff --git a/.github/workflows/tests-with-R.yaml b/.github/workflows/tests-with-R.yaml deleted file mode 100644 index 086ddb5..0000000 --- a/.github/workflows/tests-with-R.yaml +++ /dev/null @@ -1,56 +0,0 @@ -name: CI-with-R - -on: - push: - branches: [ main ] - pull_request: - branches: - - main - - develop - -jobs: - test: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.11' # Specify the Python version you want to use - - - name: Install Package in Editable Mode with Python Dependencies - run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" - - - name: Setup R - uses: r-lib/actions/setup-r@v2 - with: - r-version: '4.3.2' # Use the R version you prefer - - - name: Install R packages - uses: r-lib/actions/setup-r-dependencies@v2 - with: - cache: true - cache-version: 1 - dependencies: 'NA' - install-pandoc: false - packages: | - grf - causalweight - mediation - - - name: Install plmed package - run: | - R -e "pak::pkg_install('ohines/plmed')" - - - name: Install Pytest - run: | - pip install pytest - - - name: Run tests - run: | - pytest \ No newline at end of file diff --git a/.github/workflows/tests-without-R.yaml b/.github/workflows/tests.yaml similarity index 97% rename from .github/workflows/tests-without-R.yaml rename to .github/workflows/tests.yaml index e8fb624..d725899 100644 --- a/.github/workflows/tests-without-R.yaml +++ b/.github/workflows/tests.yaml @@ -1,4 +1,4 @@ -name: CI-without-R +name: CI on: push: diff --git a/.gitignore b/.gitignore index 834d0a8..54eecfc 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,13 @@ dmypy.json # DS_STORE files src/.DS_Store .DS_Store + +# Mac +.idea/ + +# Ignore PDFs +*.pdf + +# Ignore local scripts + +scripts/ \ No newline at end of file diff --git a/README.md b/README.md index b95d123..6bba25b 100644 --- a/README.md +++ b/README.md @@ -21,26 +21,6 @@ pip install git+git://github.com/judithabk6/med_bench.git Installation time is a few minutes on a standard personal computer. -Some estimators rely on their R implementation which requires the installation of the corresponding R packages. This can be done using `rpy2` - -````python -import rpy2 -import rpy2.robjects.packages as rpackages -utils = rpackages.importr('utils') -utils.chooseCRANmirror(ind=33) - -utils.install_packages('grf') - -utils.install_packages('causalweight') - -utils.install_packages('mediation') - - -utils.install_packages('devtools') -devtools = rpackages.importr('devtools') -devtools.install_github('ohines/plmed') -plmed = rpackages.importr('plmed') -```` ## Content The `src` folder contains the main module with the implementation of the different estimators, the `script` folder contains the function used to simulate data and run the experiments, and the `results` folder contains all available results and code to reproduce the figures. diff --git a/docs/conf.py b/docs/conf.py index 40f79e8..4946e43 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -2,19 +2,24 @@ import sys import med_bench -import med_bench.get_estimation import med_bench.get_simulated_data -import med_bench.mediation +import med_bench.estimation +from med_bench.estimation.mediation_coefficient_product import CoefficientProduct +from med_bench.estimation.mediation_dml import DoubleMachineLearning +from med_bench.estimation.mediation_g_computation import GComputation +from med_bench.estimation.mediation_ipw import InversePropensityWeighting +from med_bench.estimation.mediation_mr import MultiplyRobust +from med_bench.estimation.mediation_tmle import TMLE import med_bench.utils import med_bench.utils.utils -import med_bench.utils.nuisances sys.path.insert(0, os.path.abspath('../')) project = 'med_bench' -copyright = '2024, Judith Abecassis, Houssam Zenati, Bertrand Thirion, Hadrien Mariaccia, Mouad Zbakh' -author = 'Judith Abecassis, Houssam Zenati, Bertrand Thirion, Hadrien Mariaccia, Mouad Zbakh' +copyright = '2025, Judith Abecassis, Houssam Zenati, Bertrand Thirion, Hadrien Mariaccia, Mouad Zbakh, Sami Boumaïza, Julie Josse' +author = 'Judith Abecassis, Houssam Zenati, Bertrand Thirion, Hadrien Mariaccia, Mouad Zbakh, Sami Boumaïza, Julie Josse' + # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/modules.rst b/docs/modules.rst index ae7f15f..4172470 100644 --- a/docs/modules.rst +++ b/docs/modules.rst @@ -1,57 +1,61 @@ .. toctree:: :maxdepth: 4 - :caption: Contents: + :caption: API: -med_bench -================= -.. automodule:: med_bench - :members: - :undoc-members: - :show-inheritance: -get_estimation --------- -.. automodule:: med_bench.get_estimation + +Estimation +========== +.. automodule:: med_bench.estimation :members: :undoc-members: :show-inheritance: -get_simulated_data --------- +.. automodule:: med_bench.estimation.mediation_coefficient_product + :members: + :undoc-members: -.. automodule:: med_bench.get_simulated_data + +.. automodule:: med_bench.estimation.mediation_g_computation :members: :undoc-members: - :show-inheritance: +.. automodule:: med_bench.estimation.mediation_ipw + :members: + :undoc-members: -mediation --------- -.. automodule:: med_bench.mediation +.. automodule:: med_bench.estimation.mediation_mr :members: :undoc-members: - :show-inheritance: -utils --------- +.. automodule:: med_bench.estimation.mediation_dml + :members: + :undoc-members: -.. automodule:: med_bench.utils.utils + + +.. automodule:: med_bench.estimation.mediation_tmle :members: :undoc-members: - :show-inheritance: -nuisances --------- -.. automodule:: med_bench.utils.nuisances +get_simulated_data +========== + +.. automodule:: med_bench.get_simulated_data :members: :undoc-members: :show-inheritance: + + + + + diff --git a/notebooks/noisymoons.pdf b/notebooks/noisymoons.pdf deleted file mode 100644 index bbe7b69..0000000 Binary files a/notebooks/noisymoons.pdf and /dev/null differ diff --git a/notebooks/toy_example.ipynb b/notebooks/toy_example.ipynb deleted file mode 100644 index 7d50e54..0000000 --- a/notebooks/toy_example.ipynb +++ /dev/null @@ -1,1221 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0abfb468", - "metadata": {}, - "source": [ - "# Causal mediation analysis \n", - "\n", - "The objective of this notebook is to develop a basic understanding of causal mediation analysis on a toy example." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "1a98b2a7", - "metadata": {}, - "outputs": [], - "source": [ - "import sklearn\n", - "from sklearn import cluster, datasets\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.preprocessing import StandardScaler\n", - "from itertools import cycle, islice\n", - "import numpy as np\n", - "from sklearn.linear_model import LinearRegression, LogisticRegression\n", - "from sklearn.linear_model import LogisticRegressionCV, RidgeCV, LassoCV\n", - "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor" - ] - }, - { - "cell_type": "markdown", - "id": "a61e59dd", - "metadata": {}, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "23644e4c", - "metadata": {}, - "outputs": [], - "source": [ - "rng = np.random.RandomState(170)\n", - "\n", - "### Create dataset features\n", - "\n", - "n_samples = 10000\n", - "X, l = datasets.make_moons(n_samples=n_samples, noise=0.05, random_state=170)\n", - "scaler = StandardScaler()\n", - "X = scaler.fit_transform(X)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "64ffb06f", - "metadata": {}, - "outputs": [], - "source": [ - "### Plot options\n", - "\n", - "colors = np.array(\n", - " list(\n", - " islice(\n", - " cycle(\n", - " [\n", - " \"#377eb8\",\n", - " \"#ff7f00\",\n", - " \"#4daf4a\",\n", - " \"#f781bf\",\n", - " \"#a65628\",\n", - " \"#984ea3\",\n", - " \"#999999\",\n", - " \"#e41a1c\",\n", - " \"#dede00\",\n", - " ]\n", - " ),\n", - " int(max(l) + 1),\n", - " )\n", - " )\n", - ")\n", - "\n", - "markers = np.array(\n", - " list(\n", - " islice(\n", - " cycle(\n", - " [\n", - " \".\",\n", - " \"+\",\n", - " \"x\",\n", - " \"v\",\n", - " \"s\",\n", - " \"p\",\n", - " ]\n", - " ),\n", - " int(max(l) + 1),\n", - " )\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "e6a2752e", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAD7CAYAAABUt054AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABLaElEQVR4nO2dd3gc1dm37zPbVC3ZcpWLLBfh3jsGN2poMZAQQjEmEHCSF5LQS8B8SSAkQIAQICHEoWMIPXQMtjHFDdx7k6tsWVYvqy3z/TGa1Wi1q7az/dzXtZe0s7MzZ3dnfvPMc54iVFVFIpFIJJFHifYAJBKJJFmRAiyRSCRRQgqwRCKRRAkpwBKJRBIlpABLJBJJlJACLJFIJFFCCrAk4gghqoQQA6I9Dokk2ggZByyRSCTRQVrAEolEEiWkAEs6jBBinxDiZiHEBiFEuRBisRAipeG1a4UQu4QQJ4QQ7wohcg3vU4UQgxr+/4EQYosQolIIcUgIcXPD8k1CiPMM77EJIY4LIcYIIfo3bGO+EOKAEKJUCHG9EGJiw1jKhBBPGN6rCCHuFkIUCiGOCSGeF0JkGV4/XwixueF9S4UQQ9v4GbsKIf7X8L4TQogvhRDynJK0GXmwSELlx8BZQD4wCrhKCDEbeKDhtV5AIfBqkPc/C1ynqmomMAL4vGH588DlhvV+ABxRVXWdYdlkYDBwCfAocBdwGjAc+LEQYkbDelc1PGYBA4AM4AkAIUQB8Arwa6Ab8AHwnhDC3tJnbFh+E3Cw4X09gDsB6dOTtBkpwJJQeVxV1cOqqp4A3gPGAJcB/1ZV9TtVVZ3AHcBUIUT/AO93AcOEEJ1UVS1VVfW7huUvAj8QQnRqeH4F8ILfe3+vqmqdqqqfANXAK6qqHlNV9RDwJTC2Yb3LgEdUVd2jqmpVw3h+IoSwoon3+6qqfqqqqgt4CEgFprXyGfWx9wLyVFV1qar6pSonVSTtQAqwJFSKDP/XoFmXuWhWLwANolcC9A7w/ovQrNtCIcQyIcTUhvccBr4CLhJCZANnAy/5vfeo4f/aAM8zGv5vMp6G/61oVqv/WL3AAb+xBvqMAH8BdgGfCCH2CCFuD/D5JJKgSAGWhIPDQJ7+RAiRDuQAh/xXVFV1taqqFwDdgbeB1wwvP4fmhvgR8E2DZRvyeIB+gBtNsP3HKoC+gcYaYOyVqqrepKrqAOA84LdCiDkdHKMkCZECLAkHLwPzGybMHMD9wEpVVfcZVxJC2IUQlwkhshpu/ysAj2GVt4FxwI1oPuGO8grwGyFEvhAio2E8i1VVdaMJ/jlCiDlCCBuaX9cJfN3aRoUQ5wohBjWItj52Tytvk0h8SAGWmI6qqkuA3wFvAEeAgcBPgqx+BbBPCFEBXI9h4k1V1dqGbeQDb4YwpH+j+Y+XA3uBOuD/GvaxvWGffwOOo1my56mqWt+G7Q4GPgOqgG+AJ1VVXRrCOCVJhkzEkMQ0Qoh7gAJVVS9vdWWJJM6wRnsAEkkwhBBdgJ+hWckSScIhXRCSmEQIcS1aNMKHqqouj/Z4JJJwIF0QEolEEiWkBSyRSCRRQgqwRCKRRIl2TcJ17dpV7d+/f5iGAnUuDxW1Lk5UNY0AGtA9A7tVodrp5kBJjW95is1Cndsjs+87gMOq0KtzKieq6qmsc6GqgAABpNosdM9KofB4NaoKQoAiBB5VRQB9OqeRniLnbyWStrJ27drjqqp281/errOof//+rFmzxrRBVda6+MPbm9hXXMXsYT158et9ZHi85Pqtl5liZVz/zmw5VEFWpdO0/Sc7QkB+up3iquYhrwIYCXgNz/XrnFURfHTbLDJSbJEZqEQS5wghCgMtj6oL4rGPt/PVjmIKS2pY9OUeXB5vwPUq69ws21ZMsRRfU1FVqHS6A79Go/j641VVlmwqCvKqRCJpK1EV4GPldbi9gf0HJ/XMCLhcYi51rmAy24gQMKx3J99zrwqPfLiNtXtLwjk0iSThiaoAXzNrYNDX9h2vCfqaJLKoKvTMTuX8cb2xNBwxTreX/31/OLoDk0jinKjMpDhdHm5fvI5Vu0tIsSk4Xd5m82hOtxe7VaHe3bqFFq9YBHjiZAJx9e4Sbj9/OB9tOILHq/0mn206wlmjejF5UNdm65+ocvL+usN0SrVxzphcrBYZcBOruFwuDh48SF1dXbSHEvekpKTQp08fbLa2zY+0KxFjwoQJaiiTcNVONzc8v4bNB8sRQrOsLIpAEeCKFyVKYiwCLpjQh3fXHvK5jvp0SeVvV06kV+dU33puj5eLHvuSkionVkVwxshe3HnBiGgNW9IKe/fuJTMzk5ycHLTCbpKOoKoqJSUlVFZWkp+f3+Q1IcRaVVUn+L8nombJO2sPsvNIBaCJL4DHqyal+KY7LNEeQrvxqFBW7Wqy7OCJWuY+upy31xzwLSupclJWXY/bo1Ln8vLtruORHqqkHdTV1UnxNQEhBDk5Oe26k4ioC8IihDajg4r+WydbJrR+iNe747Ns7Jo9JQS6a/rL+1s5UFLD8L5ZnFLQjZxMB8UVTiyKYHpBs/BHSYwhxdcc2vs9mi7AXq/KS1/vY83eEs4elctZo7Wo3spaFxMGdGFEnyw2HihjfH4Xfnn6SVzxVKt1rxMGh00h3W7lRHU9Lj/9tSoQD+7uirrAYWueht/dblW4cno+i34+hY/WH6FTmo0zRvaK8Cglko7z9NNPk5aWxpVXXhl0nYULF5KRkcHNN98c0r5MF+D3vj/Is0t3Uefysr6wjJ7Zqbg8Xm55+TtUFcbnd+GLu05jR1Elq/eUkGpTqG1DKFQiYFUU8rulcaK6eeJDPIhvW6h3e/li61GumTWIS6bmtf4GiSTGuP766yO2L9N9wLuPVhliS1UKj1fz9Gc7qXN5cbq9rN17giue+pprn/mWxz/enlDim5Vm49ZzhxLoJqRvl1QW/XwKZ4zKxZLAd3sWRTBtsHQ5SNrHvn37GDp0KNdeey3Dhw/njDPOoLa2lnXr1jFlyhRGjRrF3LlzKS0tBWDmzJncdtttTJo0iYKCAr788ksANm/ezKRJkxgzZgyjRo1i586dADzyyCOMGDGCESNG8Oijj/r2+/zzzzNq1ChGjx7NFVdoZacXLlzIQw89BMAzzzzDxIkTGT16NBdddBE1NeaGx5omwAdKqvl4w2EmDcwh1WYh3WHFogiOltfisClYFE11nG4ve4ur4yb8qiVSbU2/vqG5nTh3bB/uu2hUE5G1KIIx/bvQr2s6UwZ1JcVujYoIp9vDP+d61qheLJgzOOz7kUSP4oo6bn3lO37+r5VsPFBm2nZ37tzJL3/5SzZv3kx2djZvvPEGV155JQ8++CAbNmxg5MiR3Hfffb713W43q1at4tFHH/Utf/rpp7nxxhtZt24da9asoU+fPqxdu5ZFixaxcuVKvv32W5555hm+//57Nm/ezB//+Ec+//xz1q9fz2OPPdZsTBdeeCGrV69m/fr1DB06lGeffda0zwsmCfDOogqueOobHnh3M7e+8j19ctK4aGJf3B6V51fsZfPBcvJy0onnUFBFgMOqqaYAUmwKZ47KJdVmwWZRsFoEq/eUMPuPn1Hn8rDi3jP44fg+2CyC/t3SuX62Jko9slJ56RfTuOOCEcwY0h0lgkJcXR/+u4331x3mgoeX8smGjjYwlsQ6d722nq+2F7PhQBk3PL+GOv8JjQ6Sn5/PmDFjABg/fjy7d++mrKyMGTNmADBv3jyWL2+szX/hhRf61t23bx8AU6dO5f777+fBBx+ksLCQ1NRUVqxYwdy5c0lPTycjI4MLL7yQL7/8ks8//5yLL76Yrl21OPYuXbo0G9OmTZs45ZRTGDlyJC+99BKbN2825bPqmOID/nJbMU6Xx5dMsbOokqKyWurdWoKFG5V9x6sIknUccyhCe2iVwARur4oiBHMn9GXGkO688s1+8runc93swVw6rT/f7jrO3z/ZQb1XBVT+8v5WzhvXm9vPH87t5w9vtv2e2amcO7Y354zJ5bt9J3hr9QEOl9bSMzuFz7ccC+tnMxbVCRfFVfXc88YmXB4v54ztG+a9SSJNUXmt7w7W7fFS43STYgs9rNLhcPj+t1gslJWVtWl9i8WC261NDv/0pz9l8uTJvP/++5x55pn861//Chi1A1rcbmtRC1dddRVvv/02o0eP5j//+Q9Lly5t+wdqA6bYpCfldsLhdzteVeducqLHi/jqvPPbmdx0zlDfZ3B7VSrqXNy2eD0rdx9n8beFfLThMHld07lkSh4p9sYD0G5t+l1UO91c+6+VTFv4CTc8v8aX3bfraCUPf7CNrYcruP60An56cj7dMg0HoSKwWxTsJvorQvkZrO08Wp75Yk8Ie5PEKlfPGIjDqpBiUzh1SHe6ZDhaf1MHyMrKonPnzj7/7gsvvOCzhoOxZ88eBgwYwA033MD555/Phg0bOPXUU3n77bepqamhurqat956i1NOOYU5c+bw2muvUVKi1TQ5ceJEs+1VVlbSq1cvXC4XL730kumfscMWsNerUlLlpHO6nZMLunHXBSN4dukuXw2HONPbJlgVwfMr9vDjyf3wGq6eH6470uRzfbzxCD8Y0xuAP/1kDAvf2IBXhXsvHNnkyvrW6gNsO1yOV1XZsL+Uj9Yf5vzxfbj91XUcKq0F4NZXvkMRgpp67XauZ1YK804ZwPgBXfjHkp18vvlo1L/T9kZqDM/t1PpKkrjjhxP6MmlgDtVON4N6ZIZ1X8899xzXX389NTU1DBgwgEWLFrW4/uLFi3nxxRex2Wz07NmTe+65hy5dunDVVVcxadIkAK655hrGjh0LwF133cWMGTOwWCyMHTuW//znP0229/vf/57JkyeTl5fHyJEjqaysNPXzdSgVubbezc//tYrCkmoyU6w8e+0Ueman8uf3NvPmmoOmDjBapNos3HH+cP7w9ibqA5TJtFsVfj57EJefnB/g3U15ccVe/vH5TlwelRSbwm/PHsr54/tw5oOfU16jZZZZFIHHcJtgVQTLf3c6iiJ4/ONtvL5yf1xlDA7ukc7Ci0Zz1+vrOVFVz3WzB3HRpH7RHpYkAFu3bmXo0KHRHkbCEOj7NDUVeenWYxw4UUO920tpdT2vrdRqDW85VNGRzYWVPp1T2xRxoE+w6XhULVvvzguG062Tg4wUq2/CzCLgiun5XDatf5vGMHdiX4bkZqEIwah+nX3JKb8+a4hvAu/qGQPo06WxnoIQsH6/FnJz9YyBjOvfhaxUG4N6ZDC6XxYpNoVUm4VOKVZf2NulU/O476IRAcPgIk1OZgoPvLuJwuJqKmpd/PXDbRwtr432sCSSmKJDLohMw0lvtShkp9l5Z+0B6lwerIrAahG4PGoTiy5aHC6rbdH/bFUEv/3BEIrK6nh+xV7f8pwMO6cO6Y7DZuGs0bkcLa/lNy+s5VBpLT+a1I9rZw1q8xjSHVaeuWZys+Vnj87l1CHdcXu8ZKXZOVhSw+FSbbwWRcHW4HTNSLHx2JVNL57f7TvBsYo6phd083WmqHa6WbRsNxeM78OPJvdjwaJVVNQGzlwLN9/ualor2O1VueaZlTy/YBqd0+1RGZNEEmt0SIBPLujG3Il9+WTjEUb1y2ZA9wzufn0DdS4Pdqtg6uCu7DhS6fNvRhOjhyXVJjh7TC7vrD2EVRGM6tuZR68Yj8WiUHi8mjdWH0BVvditFp752WQchpndHlmpvPyr6aaPL93R+BP88oyT2H2sigMnarhwQl+G984K+r5x/ZuHzNz92nrW7C3B7VF5f90hpgzOYdWuEzhjJM2uuNLJDc+v5oUFJ0d7KBJJTNAhARZCcMOZJ3HDmScB8PrKQrwNNWLr3SrLtxbjiaEqO7rroHN6Cr8+cyg/mzmY8hoX/bumozS8mNc1nTd/fQqFx6sZ2D0zKk0nu2Y6eH7BtA6/f+vhcp+f2OVR+Wr78ZhwRxjZWVTF4dJacg3lKyWSZKXDKlNV5+LWV75n2+EKRudlU2+YIIoV8RVA53S7r/bC4bJaLnvqa567bio53ZuHzmSl2RnVL/Zvj90eL59uKmLvsSqWbj2KIgS/mzuCc8f25r+r9vtSwWPAA9QMqyLYWVQhBVgiIQQBfnHFPjYeKMPlUVm9u3n8XCzQv1s6Q3Oz+HjDYV/g+LHyWpZsLuK8cX0AqK5zY7MqzWJ3Y5mFb2xkxY7iJhlIN7/0HR/cOotJA7vyzBc72XmkEhUVpzu2VNjtVQnSe1UiSTo6rDpOt8dnYQmhWTbRxn8EeV3TuPW8YYzO6+yrP6wIxedeePzjbZz54Oec8aclrNodPw0mv95Z3Cz9s7regxCCSQNz+MfVk3nosnGM6te5yTrZqVHpQNWMr7aHN9tPknysWbOGG264ocV1li5dyrnnnhuhEbWNDgvwZSfn06OTAwGM7JvN3Il96dslLSo+R0XAyL5ZvqgBnY0HykmxWfjr5eOZObQHndPtnD0ml5lDelBS6eT1lftxe7WuDQ+9vyUKI+8YI/tmY7coPt+2zSL41ekFvtcVRbBiRzHrCkt9yyxK7PSfK67UOgbUON0s+PdKpi38mJMXfsx1z66kqs7VyrslkuZMmDCBxx9/PNrDaDftEuCymnoWf1NIZa2LLYfKKKmqx6IIzhzVi5t+MJRLpkS+/muGXeGqUwfwxLyJjDFYfFYFTuqlZWI5bBYeuGQMH946i1vPHYaiiAaxbiyuY4xGiHUe/MlYrpsziGtnDeL1G6bz/i2z+LHfd19UVuubkEu1W7hgfF9q66MTkubPusJSXvu2kDdW7WfDgTK8qnZx2HSwnOe/3Nv6BiTRZ9FM7WES1dXVnHPOOYwePZoRI0awePFilixZwtixYxk5ciRXX301TqcTgNWrVzNt2jRGjx7NpEmTqKysbGLdrlq1imnTpjF27FimTZvG9u3bTRun2bRLdY6W1/H3z3bw9toDFJXV+sKbHnh3C9/vO8GX24sjmi5rswheu/FUumQ4OFZRh8frpVd2Cr07p1HQK5OfzWweq/vFliIe+XAbmSk2rpsziFe+3kdWmp2FF42K4MhDI8Vu4bJWMvDmnTKAVbtL8KrQLyedkX2ymvRtiyb1HpUnPtnBWaN74TX4g71eFZd0ECclH330Ebm5ubz//vsAlJeXM2LECJYsWUJBQQFXXnklTz31FL/4xS+45JJLWLx4MRMnTqSiooLU1KYTukOGDGH58uVYrVY+++wz7rzzTt54441ofKxWaZcAq6rW8eBASU0zV8NHG4pMHFbrpNgUBvXI9AX1/783N/J9YSleFUqq6jkptxMX/nU5ed3S+fNPxpKdbqfG6ebeNzZS7/ZSXOHk4/VHeP+WWREdd6QY2juLd2+aQXGlk75d0rj79fW+1xSh3fpEc37Oo3rJ65re5IJtUUSrFxZJlNGt3sJlTZ/PXxrSZkeOHMnNN9/MbbfdxrnnnkunTp3Iz8+noEBzrc2bN4+///3vzJkzh169ejFx4kQAOnVqXm+kvLycefPmsXPnToQQuFyx69ZqlwtCCM3qzO2Syjlje4drTK3SLyeN35w9hCfnT/IVvSmpcvomBVVV5fWVhZTXuthysJwnPt0BgMvjbVJcp8oZuz+MGWSk2MjvloHVorCzqNL3/dgsCg9dNp5RfbOwWQQpNguTB+REdGzpDhvLtx1rciHP755O18zwVNaSxDYFBQWsXbuWkSNHcscdd/DOO+8EXK8tJSR/97vfMWvWLDZt2sR7773Xri7FkaZdAtyjUwrXzyngz5eOpUuGnUkDutC3S1q4xhaUgydqeGftwSahY7864yQcDeFko/p1Rmn4kTyqSrVT831mpdm5dGp/rIrAYVW46QfJU4Dk4kn9SLEppNktDOyRyaSBOTx99WQevmwcT8wbz0OXjSUzgsknFbUu1u8vw3gu7TlaxRMfx66/ToJm6c5fCnkztIf+PEQOHz5MWloal19+OTfffDNff/01+/btY9euXUBjKcohQ4Zw+PBhVq9eDWjlIvVawDrl5eX07q0ZiP7VzWKNdp1x2el2LprYh/MeXkZVnRurRVDQMxOHVYlouqtXhf3Hm/ZmOrmgG+/dPJNqp5tumQ5uf3Ud3+w6TmaKletmN/qCf3l6AVdOz8dmVUwpIh0vXDqtP6PzOlNWXc+EATnUujxc+8y3HCytJTvNxr9/PpV/XzuZy576mvoI+iaMySIeFV5fVchpI3vSu3Maxyud9M1JwxrPrVQkbWLjxo3ccsstKIqCzWbjqaeeory8nB/96Ee43W4mTpzI9ddfj91uZ/Hixfzf//0ftbW1pKam8tlnnzXZ1q233sq8efN45JFHmD17dpQ+UdtoVznKzN4F6qhr/xbWcKZgHRsm5HfBZlFYtec4XhXOGNGT+y4e3eK2qupcpNqtvn50kkbeWXuQRz7citPlxaIIrpyej9Pt5fWVhVEte6ndmQzhrx9tR1W1ZJp//mxyXCXKxBuyHKW5hK0cpaqGP5ZUETSrlmVVBAO6Z9A104FAoKqwbNsxjlc6W9xWRopNim8QtIp22ndjUQSdUm18t+9Ei+IbCQ08uaAby7cdo7beQ53Lw/7j1XxfGJuZlhJJqMScWaEogsrappNjbq/KF1uOsudYFe6Ge1ZFCI5VxK5zPdaZObQH547tTU6GnRlDunPRpH6cOyYXh1Uh2BxHJLxMy7YdY+Wu477nNfUe/vDmRt7/PjEK/UskRtrlA1YEZKXZqKhxmRrvq4hGX+CQ3E7sK66msq6pY/14lZOLJ/Vl19FKLIqgd5c0Boe5HUoioyiCm88Zys3nNN4qXTw5j/zumRwpq+HRD7dR5TSn221byEixUFXnweNV8d9rcVU9v397M3uLq/nVGSdFbEwSSbhplwALIRiem8XqvSdMDZg3CvDmg+X8+9op/Hv5HnYWVXCkTLNyVRW+21fKi7+YxrEKJyP7ZjdLPZaExktf7eWZL3bTOd0eUfEFqKprfX+vflPI5Sfnky0LuptOW8K7JK3Tnjk1aKcLwuNV+XrXcVQ1fPeiXhVW7Cjm1nOHMbJvdpPXOqXa6JuTzvj8LgEnZY6U1bL5YBlumU3Vbo6V1/GPz3dR5/JQVFYb0HeuoMWBRwshCOoekXSclJQUSkpK2i0ekqaoqkpJSQkpKSltfk+HAj/N9gWm2Cx4gZoGq+tfS3fz3PI9XH5KPg6r1t4o3W5tMW73881F3PfmRhRFC417cv4kOQHXDtyGeDAVGNg9g26ZDnYfq+RYhZbk4gXwqs0aiIYT/Se0WxVuOPMkstI061dVVT7dVMS+Y1WcPqoX+d0yIjKeRKRPnz4cPHiQ4uLiaA8l7klJSaFPnz5tXj8mKtCclNuJAyU1PgEGcHlVXvt2Px/cOpuislr65qS3GIq0aPkeXyzy9iOV7C2uCnvL7EQit3Mql07N44UV+8hMsfKbs4dw26vrqPCbEEUFRGTjhBUBUwZ15cKJjV2VF39byNNLduJ0eXn120Jev+EUcmQWXYew2Wzk58sU8GgQEwL83d5SbBaBVRE+S0wA2Wk20h1WBrZBSPvlpLGvuAqXR0VFJSdDnoztZcFpBfx89mAUATuLKpu5chQBA7pnsOtoVUTH5VXhUGkNtfVuUu3aIfvNzuO+zh9CCHYdrZQCLIk7YmIWS0WrkGWzCP55zSQKemUyvE8WD102rs3buP284cwZ3pPR/bL5y6XjZOfdDmJRBEII+nVNJ91hxWFVsFkUemWnMLBHBhMH5BANz87+4zUsWLTa56ecM7wnKTYFm0WgiMbSoxJJPNGuTLiM3AJ1xLV/C89AgEkDc5q1X5dEj7LqepZsLqK40skrX+/D6faSYlNwurzNwhAFUNAzk+1FlWEbj0URfHzbLDJSbACs3HWcfcXVzBjanZ7ZssecJHYxJRPOTG4/byiptsbdWy2yFGGskZ1u56JJ/UizW3yuoXp3U/G1CEizW5gxtDsPXjqWjBRz62sMy+2E3apgUQQ9s1J477uDXPHU1zzywVbG53fhkql5UnwlcUtUfMCKgL3F1Zw5KpcP1h+m3u1FEYI+XeSJFIvMGtaD577cC6jU1jdOlOpp49VON19tL8arqk0mUttLToaNiloXXi/YrRbuuGAYFqGwt7gSVRUM7JHB/3trky9FObdzKj+Z2j/0DyiRRImoCLBAq0l7/ZzBeFSVvcequPKUAeR2jnxpS0nr9M1J57UbprPjSAV/fHsTx6vqAS05pqSq3mcRL98WWhhTSZUWcaEILeb73v9u9G07J91ORZ0Lt6fREj9cWhvS/iSSaBMVAc5MtXHlKQNItVu564IR0RiCpJ3kZDgY27+LT3whcNU6M/CqcNSvzkdJdeN+LYog1WZhaG4nzvnLUtxeL3ddMJxTh/QI04gkkvAQFR9wusNKp1RtIqW8pp4H39vMPf9dz/7j1dEYjqSNaBER0U9u6ZeTxlu/OZUnPt1BSZWT8hoXd7+2Hm+EkkMkErOIuAAL4Jwxub7nt726jve+O8Snm4q47t+rIpZhJWk/Qgiu8Wt0qgiwWwWd02wRG8eV0/PJTLX54oBBC2OslC3tJXFGxAW4RycH82cM9D3f21BiUlW1NjWx0jpdEphunZomO+RkOHj22ql8cOss5k7o06xZq9n8cHxvzh6jtZs5qVdjgo7NIli1pyTMe5dIzCXiApxit7B+fxmvfrOPvcVVzJ3YhxSbhVS7hYkDcnwxnpLYJNVubVKYfXjvLAb1yEAIwXWzB5Nq18LQwiXEH64/wi//s5pap5tJA7viaBiMRRH0kuFokjgj4okY4/pns+VQBR6vilVReG7BVMprXNTUu5mQnyML6MQ4tfVu5v/zW46W1yGAvK7pbD9SwfA+2Tx2xXgKj1fz8tf7qHN5+HJ7+Iq7jOqXzZNXTeSJT3ew6UAZF4zvw3nj2l4ERSKJJMESMSIuwIO6Z7DrmFZLIMWmdSaWJ0584fZ42V9Sw6rdx3mqoSCO3SK4/rQCfjqtPwBfbT/GTS9/H7Yx6JZvRoqVhy8bx5DcrLDtSyIJlahlwhnt2QyHlcun5+NoyIBTVRjeJzvcQ5CYjNWiMKB7RtOu0kI0+a1XbD8W1jE43V6cbi8lVfX87JmVPL1kh6xnK4k7wh4HrAKd06w8ffUk+nXN4ERVPVZF4GlwNUiHQ/xy9uhcPt9ylLV7TzCsdxY/nNB4J/P++iMRG4fHq7L4m/2M7d+FyQO7Rmy/EkmoRGQSrrTGzWOf7EAIwbJtx3B7VdxeFafby/vrDkViCJIw4LBZePzKCXx17xk8ffUkX6lIgK5hLg3ZOd3WpDuG2+ulzJCsIYk+CxatYsGiVdEeRkwTsSiIQyU1APTpkuZrh55iU8jrmh6pIUQMeeDB/FMHhLVsZWm1i/PH9vY9d3tV9hyLbJ1iiSRUwuaCsFnA49XSSq2K4NKGyZlJA3O48cwCPtlYxMSBOZwzpnfLG4pDdh5pLMmoC/FT8ydFazhR4cxRuby5+gBbD1f4lgnMTV9+57tDCKHNJagqrNlzwsStSzqKfsx/v6+0yfOWzoFkPU/CIsCdUq04XV68wotNEdxz4UhOH9HL9/rcif2Ya2gvkygsWLSKnUcqqXJqySSn3b+Emno3o/M6N1sPEvtgs1sV/v3zKfz5f1t4d+1BhBBcOLEv/121HzOTHfV5N4dVYfbwnuZtWCKJAKYLcKpNoV9OOpsOlgNgs4mQShTGEzuPVFJjyOTThfj7faVJIbr+CCG47bzh/OK0AmxWhRSbhS2Hyn3Hhmn7AW486yTmTuhr6nYlHUM/xttj+bbHWk4kTBVgAdS6vL4TTHcBjvJrL5+ILFi0ipp6d1Drbn1hKWl2KwsWrUq6gy0z1ca6wlLeWn2AyYNy2Hq4ImDND0XQIetYBYb1zkLInvVRIdhxrBskxvmQRD/W24upAux/7lwytR9njcolv3tytAwPJh6KgDS7lcG9krNL85GyWn79wlrqXC3fCXXUNWER0CXdgcvtxaIIFJlNGVGMcx5GBvfKZH1hKTuPVAY99ttjLSciYY0DvuLkAUnTqfap+ZOYtvDjgCLiVTV3RLADNdHZf7waSxjjbYbkduK5FXt4e81B7FaFhy8bx7j+XcK3QwnQKJq6q81o6RrnQqqc7qS762srYRPgS6b0Sxrx1Q8qMyeXEuVA3X6kgoVvbKDa6cFmEbg85merbT5UweZDWrRFbb2HB9/bzOL/O8X0/Uia4m9Q6M8H98r0iW8w/I/veD/OO0rIAqz77RSgU5qNn0zJ4+LJ/WRVM5r6NDMcmgviqfmTEkZc28ID726mtEar06uqcMs5Q3juy70cr3SaesEy4nR72bC/lFH9Ore+sqRD6Mfw4F6ZPusW8E1Cd9Sfn2yELMAvLJjGv5ftJtVu5ZenF9A53W7GuGIa/yQLXUhPu38J0HhLFugA1EPVgvnEEnlWWBGCU4f04MxRuTz+8XaWbjlKRZ359Z+PVzq54fm1XDo1j+vmDDZ9+xIN3aDQwy11dh6pDCq+tYVrAdimDgAS6/juCCF55rJTrewtruaPPx7D3T8cEVR8VVXliy1Hef7LPRwurQlllzFPTYCC8opoPFh1dh6pTIpsuTvPH05Ohh2bRfCzWQPplGrj1le+573vDoVtssztUalzefjP8j2c/qclLNt6NCz7SUb0LM/v95X6wiurnG7fPIf+1x9FaHeBT4pbeVLcGoWRxyYhl6NMsVm4fs4g0uxWhvfJYmCP5pbdK1/v4x+f78Ll8ZJmt/DfG08hKy3+LGV/61RnbH/tVld3L6wv1F7XrQBjFITxvYqArxee2eK+Es0y+N/3h3jo/S3UubwoQkugqDW0FgoHDqvCF3edJqMjTMD/HMhwWFv1945iEwJQHJk8VX8dADUig4P2AgruWAuLZmorzl8aplFHn2DlKEN2QdS5PPzt4+3YGuqz/v2qic1KTC7ffswXguRVYdfRKsbnJ9Ys9c4jlZx2/5KAB6MuxP7C7VU1t4W/dZyIrNx1nAMnapo0zrQogiun5/Pqt/sprw1fPzevLFNpGsawsUBRPQoN5zmWhudebuJJThJ7wNVYvtSh1tCnfkcERhzbhCzAVkWgovoaJH61o7iZAJ96Une2HqrA5dGsnkE94jMu2D9m0bg82AGpU1PvDjgxUeV0+yzmQPtKBP728XYWf1uIEFpH7HH9O7NmTykDemTQLyeDerf5FrDVIrAIgVdVue28YdL6DRPGu7oCdnOAXCx4qSKdwewmjVpNfAHUxjhwC17S1Cq4z9q4PAksYX9CEmAB3HbeMB76YCser5cUm8KIAAXWfzI1j57ZqRwoqea0ET3j0v3QEv6Tb8EINjGRZg97Weao8fXOYl7+Zp+vZoPT7eHy6QPISjvIh+uPcNd/14dlv3+4eBSnDukBIMU3DBgNhB0PjOd+5zWkqtV8IeYCsED9M7/haQp08Q2GmhxlCoIR0pnfrZOD88b1oVunFFZsP8bEATlMK+jWbD0hBLOG9QhlVzGFv3WqC3BLtBSSU+V0J6zPd8mmIoweAK9XJS8nnQ/DXLB9/f4yZg6TxXkixZPiVtJoLAf6G55u+5uFBewZSWX56oQkwHdfMIIdRyp4+et9pDusDOudHH25/F0QrVm+RjIc1oA1I1oKTYtnxvXvwpLNRdS5tDTh+388hvSU8Fv8dS4PO45UcPBEDX96bzOqCnecP1xWTAsDBXeshQeyod7is2hbtXyNqB6or9JcEEkmwiGdCY99sp2islqqnR4sAo6V1/Hsz6eYNbaYpqNpxcFCdBJ1Iu4HY3Kx2xQ2HyxnzvCejOybzeETNViEwBPGybFlW4/x0frDTSIs7np9PZ8OzJFJQmai+22dIVa46zc95KHEIyEJ8J6jVb4CPB4VDiV4jG+gEBwIPesnzW5tIr6J5I4QQnD6iF6+etCfbTrC79/ahBAqCuHLljoRoD2RqkJxpVMKsNkUrQvt/Y6spLN8ddotwJkpViobspdUIM1uQVW1UJ/Lp+ebPb6Yw2j5tsf1EIwMh5XP7pwDNHdtJCLPfLELZ0PUg8OqcN3sQXy+5ajpNYID0aOTg75d0sK+n6Ri/lLN/dARhKX1dRKcdgtwtwy7T4AB6uo9XD69P2eP7p3wZSeN4WZmiy80irt/dalEsIR16gwuAY9X5bxxfXh91X7Tti/Qoh70esOKgB9O6MvovGxmDOmBNZxl2ZKNRTM167ej7gfVo1m/PceYOKj4ot0CPH1odw5+XUh9Q1UrL/DppqP84vSTzB5bTKKLsH9SRUcY3CuzidVrhqjHMnX1Ho6V1/me260KVovgSFldC+9qHypgU0AgsFsVLIrgqlMG0D0rxbR9JDVmx+o6y6FwWVLGAEMHBPizjUd59Irx/PrF76h3e7EqgryuyXNbFwk3ge5bTiTLF7TkiFSHhRqnByGgZ3YqD763xfTKWXXuBuPA5eHhy8ZJ8Q0Hi2bC/hVJH8cbKu0W4BPV9XTJcPDk/In88/Nd5KTb+fXZQ8IxtoRGEU1Dz4xlKhMVq0Xh8Ssm8PAHW0m1W7jzghHc+PyasE3EeVV46au9TB3cPDZd0k50C7VwmfbXkWWe+CbxJFy7HWJ1Lg83v/wd2ak2Hr9yAvdeNCrhMtvaQobDSobDSjiSrJ6aPynhrF+dEX2zWXTdVJ6cP4k+XdI4Z2xv7Nbw+WXX7C3l0idWcLSsNmz7SErM9NvqMcBJSIeO/IMnapn3j29wttLjKxHZeaSS9YWlVDndVDm1lvMdEWG9bJ9e1u+0+5ckZcuiK6fnc/643tgsjV9iJ5MTNfYWV3PBX5fzztqDpm43qZi/VHvkzdAsVpBRDCbQ4SO93q1yvNJJ7yQJ6/Hvf6WTjKJpJg+8u5nPNhXh8aqk2BRmDetBbb2HpVuPmb6vxz/ezgXj+5i+3aSjvkqLfpD+35BplwVsaTD1FAG9slPokWSTG/5iqwitypkZxXR0i1ovcp3o/mCd1XtO4HR78apaosR1swfz1Y7isOzLEUZXR1KguwlUT+iZb0ZUjybo/m6IRTMT3jXRLuXokmHn4Z+OpazWxUwZU+kjUBcMSduYObQ77313CK+q0jndzvx/fhOWxp0AV88cEJbtSkzAWd4owkk0Idfujhhn3/Ys//jZJLpkJEfHY51A3TDa0g2gJXTfsTEKoKUuGYmI16vy+ZajlNfUs2zrUVbvOUG4KkRkOCy889uZESkGlNDomW9mWsHQPClDj7jIm6H9jWNhDtYRo90m7OGyWhZ/U2jOqOIIPTJhbH9t0k0voBMKXrV5/GuydZJVFMFpI3py0aR+pDnCK4xeVTt+JSHSc4z2cGSZNxEnLNo241hkO0K7BVgRYLclp+tBT0PWhXPnkUrTQ9EUkRw1IQLx67OGkJ0euFCOGUdcdpqdo+W1/PGdTbz//SHac/cnMaCLpLPcvIk4ox/YGHGRN6PxeQLSbpNjdF5nLp3aPwxDiT903284rNZErAPRGj2zU5k9vCdvrDrQ/EUBofgmbIrg7h+O4LcvraXO5eXTjUVYFMFZo3M7vlGJJETaZVgU9MrkiXkTw36rGKvoYhjId2sW+jaTNbztJ1PyfKnYRkI1Vi+fnt+kXGqdy8PGA2WhbTQZ0SMTdP+smfgndySw5avTLiVVRPL21goWBxwOEdarrekhaZA8lnDfnHT+d8tM3ly9n/8s34PT5cXt8RJqYERGio2JA3KwKAqpdoGqqpw2QnbH6BD7V4Rnu6HWFY5DktOUjUEyHFYG98r01Ycwo9pavJJis/DTafn8dFo+pz+wxFc/OBTeXrOfNXtKePiysRwpq6OgZyYDeyReC6iwo9f/NTsCApKyLKUU4DaiW6Bt7YDcXmrq3T7xNRbmSRbLF2D7kQrKauoZl9cFm1WhpNLZpPZ0KBw4UcuBE7Ws2VvC27+dQU6ShVGayh1lsNCku2H/KIoEdzn4k5zhDCEwuFcmg3tl+orxmE0yCa6Rxd8Uct2zK7nj1XX88rnVeL0qX2w9iqXD53mjz8JBHS9xHS9xHQXebWyOQPeNhMas7DRHFtzr1vrB2RO7mUMwpAC3E2OlMjMz4HRfstHyTSYxfvXbQupcXmrqPWw/XEFReR3ZabYO+tgb36TgYhwbGCgKGSgKeYh7KBB74dhm08aelBiL8nSUnmOadtXQC7MnePqxEemC6AALFq0KS/pxlVNzQyxYtCqpxBdgUI8MjlfW4fKoWC0KXdLtHbRUVRzUk0IdFWQyiH3cw198r2ap5SivzwbVC6fcCTN/Z96HSHRCbUHkjz6Zl6TWL0gB7hB6iFg4IiB0X3Cyce+FI3nqs52UVDm5esZAUuyab1CIlkLQVARetP7Kmq9iPOt4hHsQqHzOdPLZTxZVuFUFBa8WQuhuCEdbeg+4quH0P4X740kCoXo0H7Au6LpFnUR+YCnA7SBYKJoZGP3JoaY4xyMZKTZuOXdYk2VXnjKAZduOcbi0hsZMDE1oM6jktzxJJ6opJoe/M58a0vktT+MQLgDOVJeiR04qwbI4Vj4GExdAdl54PliioLsFzIx+EBbN/xuOmOI4QQpwCIRajMeIcTt6gXY9IiJZ6Zxu5xRlLe+QTx2p6OJrxcX/409ME2sBqFPtdOcY++lHPw6iqprl3KawdRVQ5GkQFfT0Y70ITxJZvjryyGsHuhgaazUkc7xuJDi98kXe4y4c1KEiKGA349jAZNb6hNZBPSeLtZzM2rZvODUHvC6Y9Xvo1Dt8HyBR0MXR7Epo9VXmbCdOkQIcAnq87vrC0pD9wRkOq29ib3Re56S2fH3s/5rhru94gV+wkvEUsIvhYgf+hm2HEjSz+8M134JFngLtor7K3E4Y/aZrf5PQ+gUZhtYh9BAxY3W0UKlyun1V1tYXNqYgJ1N3jGasfhIB9BZHuVB8EFB8O8zRDfCX7vDMFKg4bNZWEx9dMM0iCdOPjUgB7iC6+IZjQs6MFkcJQa9xYJBcUyuReF1QVwqH18AHvzJzy4mBfzzuopma+yGJJ8zCgRTgEAhXtIJeE2LBolW+rslJaQlPuRFm/A7Se5i8YYOUm93fLJEI1KfNbIytiJIQKcAdRHdD6OFjZhZlT8Y44IAoFph1H9xSBLP+YMIG9boDqhYCJSxg7wSZfeDRAfDuz8GT5P39jOUm/fu06V0wJKYh73U7SLhigvVWR/4RF0k/KTdxASz/PXicHd9GajbUV4OnDqwpMPcF8NbDOz/TEjI2vAi9xmr7SmaMftlwWKh6/O/+FVoW3B1l5m07zpAWsEmMzusc7SEkFlXH4MWz4W9DYNNrUFWk+W1DofYEoEKXwXDq3TBsrrZMbSh36anX9pPMVnAgS1cvEzl/qSaWoVrBqkezsHX3zwPZjeFtSUa7uiJPmDBBXbNmTRiHE3/ok3FgnjWs1wZOaqv31bmw43/gdYPFAeOugdV/N2HDCpz2AEy/VXtaWwr/mADVRWBLB3s6lBXC4B/AT95O3jA1ve6Df4KEWWUojeiCnsCWsGldkZOJtk58VTndproikjEVuRkVhzTxBc3tsPafJm3Yq7kfdFI7w/9tgwUbYNiPoPwAoGoW2o7/mbTPOES3hCH8Fcqc5dojySqhgRTgDmEUZn0iLhy1gZOaOfeDLQ0Um+YzDNX9YOSjX8PbV8Px7fDMZHi8AL5fpAm+aDglVBUsgTs0Jw3Gnmz7VyStmyCctEs16k1oDRMP6OKqpxkHmwgLZ3GepGfgafDbg3DwW3jtYnDVElJb5CaosPk12PMpVBzUFn35R21CqNswKNkBw38Eg842aX9xin/asdnhevpknE4SZsNJs60d+Auz2VavHsq2vrDUN6mX1FEQqZ1h8Nlw2Yfw7jVQvj+0KAgjqldzcxhx18OYeTD1N+bsI9EQFnPTkM3cVpzSLgWxW5PDYxEsBMzfH6z7as2oBQHB6wsna5F2H/uWmiu+AJ36QNneRj8zaBNuPUaZt494xb/0pC68Zgpm3gztbxJavUakBdwOAgmxWeLrz/f7Sply78e+50ktws7ypkJpBqV7tYpotccBATmDYcY9MGCOufuRNEdYkl54daQAt0BrYmdWIZ62kLTtilQVOg/UwsPqqw1WmAKEMCehuqHmqPa/LR1+9pXm8pA0iqMeigbm+X+FJalbEPmTHD4Fk4mEAD4pbuH5lDuTvlMGXz4An96qia8jE+b8SYsLNm1CDq2eZVmh9r/H1VIPpOSiaJ359Xr1C2iShZsFQ1rAHcTseg1PcxMO6vkFfwZgMHvIxOorzJO0iRk73tPShAFcdbDkTkKyfAOR1h26DoFP74Cv/6IJ/RWfQO+J5u4n3ug5pjEZQ6+CZsZEnCx+5EMKcAfRrVEzOmIMZjcD2cdBcnlXXA4qZFINTnh4v+aTvIklIe8nLjnpAji6UUsTVj2YLr7Dfwzn/ROqj8LKR7V91JXB/xbAdUma9albp7roGmtDmDURJ33AgBTgNuMfEaH/Pe1+TRhr6t0d8gc/yS2kUkumqGYoO7WFhmxPh1qDRYjktH4Bpt8GXQZB6R4QVlhyu7lJGeOvB4td6wvncz0IsDrM20e8E462QQ9kJ20fOCNSgENEt4TbUw9iMLs5SC4KHtKp5iSxJ+i6Fryau1O3SpLtgBUChl/c+Lxzf9i8GCwpsOnl0KMjXjwDOg+A676H0x+Ez38H6d3h/H+Ftt14xr9LhYzXDRtSgFuhtaw4/9C07/eVkkotD3MPf+V6djKw2TZv4kkeZgEZ1AQV38YG7BIf+5bBW1dqFnDfk80JTfO6ofKIlmo75UbtITHf6jX6jkVDXeZkMyYCIAXYJIwuiYFOTVSfRKu4dRP3UUsqd/IoCh5OEnt4gZbb4AhomqopD1b46DeNE3L7vjBvu65azeo7vFrzCecMNm/b8YR/AoaZGK1o1ZP03ZB1ZDnKNtKmlOBFM9lWeBhF9VDQYNl6VAUndrwoHKInAnyvtQtHVtO6rMmI3qnBbISloSawAEcG/ODvkN4NBp7RwZbLcYr/5JvZqcf+JEEZSh1ZjjJCDHEcayKwFuElTdSRIWooYE/HxBc0iyHJO8iS0ZNGx4xoiAc2AdWD5vTxgqsG3rsWFl8E715rzvbjBb36Wd4M7XGvO/wtiOqrkjomWApwG9F7wLWIoWOAJ8BXG5IxpXcPKFyW+HVT6yrgjcu0Qulb3mxcXnkIXwKGLQ1m/T8tgsFMvG5w12mujo0vm7vteEJvQ6TfdZmJsGjCfkeZ+W3u4wwpwGbTIIwWR2bjZIPZJLol/NGNsOW/cGQtvHk5lOyCog3Qf7Zm9dozoNtQmPJr89sHWVO0302xQffh5m47XjAWYw/HsWZsRVS4LDmMiiDISbhwYcweMhNhaYyfTNTQtNI9WuIFaAXSF50KVUfwxeee8RCMvRo2vISpiRm2dK1XXHmhJsAz7jFv2/GCXv8h3Nlq4TJO4gwpwGbRUvaQmdgzggtuogjyqb+DVy8AhBaT66vbq2rugf1fwYTroHC5eft0ZMEPF8HQueZtMxEI10ScPaOpeyPej9kOIgU4XJgdZqNHQRgtX13sE+3WbeBpcOMeLT63ZAe8Pc9g6ArYuwR2fgijr4B1i0zYoYABp0vxDVYHOFSM1q6+Pb3dPYTHzxwnSAE2i3CW8IOW0zb995cIlnBGD+3RczQcWgnrXwBnheaaqDysRSn89gDM+gN8cXdo+8qfDXOfM2fcksAYS1Dqx2myh1UiJ+HMx2zxdWRpIUG65av36TKGC/Uck7hWhBBw5sNw6zHoMpAmZShrS+GU2zXfrY4lVfPftpX82dpM/Ds/g12fNH3t+A7Nz1y6L5RPED8Yw9AcWebV7dUn3fSHTtG6xJ9QbgVpAZuNXsLPLPRbtUBuBn0/eiB7Ili+LTH7D1p4mhCQP0cT5EOrtdhdHU9t27ZlcWi+5o0vwbL7tGVbXoeLX9Each5eq03+6V2Sr/suuTLk6qsiUwNCjwNO1GO2FaQFbDbhOJCc5VqtgsJljbHAuiWcqJZvIIbOhRt2wlmPwsm3aMsOr6FDh7HHqQnv8a2Ny1QPbH1L+3/rm5qw11dpRdp3fhDq6OMD/SJupvgafcB5M2QcsAFpAZtNpCbEdAtFj6GExLciVBXeuQYOrND+H/4j2PQa0EGx8C9rKSww6Czt/96TtGQPVw0oFug1LqShxw3hCEHTxVxvxAmNlq//RHKiH8N+SAE2m3D4tPSiPPtXaAezHhERjjjjWMZZCXs/a6yCtuFl8Nabs+3OA2HOAzDiR9rzIRdoJSl3fQzDLoK8U8zZTyyzaGZ4iuQ4spqm0ie51WtECrBZhLuSVCCxTbbW3vZ0rZNxTbF2UcroARUHzNl26e7mbe9HXqo9koWideb7fYWl8Zww3rXpx26yHcN+SB9wvJKMs8eKBa76AnqNhx6j4JI3zS0WU7jCvG3FE3p0TSTKUOok4/EbAGkBm4UxDhiadpQN12xyMloN3zwCxzZp8cCvnAdj5sHKx0PfrlBg7FWNz1UVdvwPqos1F0RKmKuCJTrGpA5jUlGSIwU4HOhX93AJbzJFPvizZwm4G0LNnBVagkao9BgDc5+HniMbly25G1Y+Bqjw1YPwi81gScDTRZ8IC3dtBmNMsRRfH9IFYTZ6JalwiaQjqzFgPhkZdpGWeGFNhYxe0GMkEKJ4HNuoVVczsukVrSSlqwYqDmoFeiRtwxjt4MhqDDnTz4tkPXYDkICX9BhAP8AeyA5PQHsSB65z+p+hzxTNNTDiEs1V8MXvYPWTHd+mPUPzLxvpPwM2FWkTc/YMyOwd2rhjjUUzG6NqwNxjdP+Kpu20dJL1mG0BKcCS+EIIzQo2cs7f4ehG2P9l+7ZlcWgdkS94tnm1/HP/Ad1HQXURTPwF2FJCG3cyYRRzKbotIl0Q4aTnGPPy6Y0ES03WSabi1l6PFh+c3b/971W9cPKt0Hdq89esdpj2G61VfXZeyMOMSSIRj5ssx2EHkRZwvOEs127vZBgPHP4Onp+jTcaJDtgSXhfs/Vy7UPYaY/boJPIYbRUpwOFGL85jZoylXl3KP30zWJ3gRLoNVFX4+mHY/o5WlrKurGF5BztjbHwZtr4B166G7sO0ZauehA0vwoA5MPM+UBLsRtF4XCwMY9dn/4noRDweQ0QKcDgI1N7bkWV+oHsgCyPRrY6tb8LShVqEghmhU7q/cv3zmiVtS9cm9Nw1cHQ9ZPWD8QnWHVmPUY+Ee0CKbYtIAY4U4bCE/fP29ZrBResSN9znhCFlWAgQ1sbaEO0hoxfUlWqVzlSvFvPrrtO6LHsbag67arSOHImCXkFPPwbDXUvEf64ike/MOogU4HAwf6l2sOvZP8FqOYSKbr0ZRVffz/4ViRmuNvzHsOJP2mcXFk0wq48GXz89F6qPACrYMmD+cq3LRs1x2P0JlB/QEiyW/R6o0zLsLA6wdtLeP/bqSHyqyBCOQjuSkJACnAjo8ZxG90M4oi9igc794YZdWjpy9xGaiL4zv8GSDRDLWn0Eug7ROi0rDYVhtr0Fb16hvX7S+XDmX+HLBxo6a6hwwSLo1BtyToL0rpH8dOEhHDV+24L/XZi0fJshVFVtfa0GJkyYoK5ZsyaMw0lAjIXTjYHvZiMsjaKr32ImS6WpvUu1ehDb3mp9XT1aQp+0s6bAr7Zr4rz7E03Ue08M21Ajir+PN9LlSxf6aUsSC7AQYq2qqhP8l0sLOFHQIyPCndMfbSoOwconILULjJ0Pz58OR9cFX9/i0MLNdMFtFi0htEI7KVna9hKRcDeM9UfPgvOfj0hC4W0NKcDh5o6yyN4C9pueuJNwHjc8MxmqisBi0+o1FG8Ovr7FoU3UnXwbfPuoNsnma+opoNtwOOMviVfpzD8KR78L0wl3Mf9+0xsnhCUtkmABjkmO3j05Uaul1RzXHqpHE9OiDTQ5hBU7jJ6nCYA1RYuWcNdphXTuroEf/1dbLhRwdII+k2HAaVH7OBFHPy7CGaqo12d+ILt5D0MpyM2QFnAkCHdxHh097Mc/MSNRLOH07pBToFUvA8CjpSILK3QtgOwBWmdj1dsYmmZNhbRu8HAfqD4GA8/Qst+c5ZoF3W960zrAiYBufeqx55EKO9Pr/ErajBTgSGPPCJ//Te+enKgoCoy8DJbc3nR5Zi9YsBF+b2v08SpWyJ+tVU4rXKFlzaHC7o+Bhuwvr0eLBU4E2nqx1fuzmd31WHc7GMejJ3zYMzRXnKQZUoAjifEWMFwibM8InpKsE88W8aGVzZeld9fEOaMXVB7SltkzYchcrdDO/q8a11WskNoVao9rJSZHz4vMuCON/12QEbOPv0BFfXTxDZY2LwGkACceiX4LOOYqzYr1uEF1a/3hLn5Fe23eEvjnRO3EryuFj27UfL5nPQYndmkuiGm3wMx7tf/Tu2vrfvNXzVUxdj5YHVH9eO2mvfU/jK2yzEKf9DXuP5xRFgmEFOBI4H+SGDsGhCM22OgDNApyIqSCDjlfy2Yr3gIDTofMno2vZec3iEtDpIPq1dKJD6+Gmw423U5mL62wz79P0dKNhaLFAf/kzYh9lIjgH4IWDmFsaZv6pFw8HmsRQApwNAlHG3Bo6n/T95NI5I7XHv5Y7dB7kvZ59XoRtrSmFzwjrlpNyPXfYO+SsAw3rLRlwlXveByuGHG9TVag8UhaRApwJAh2kugNEc3Ef3tGIU6GbrTzlmiNOssLob4a+p2stS4KhC1Vy3wr2Q4omkWdqOgTZeGIhHCWw33WwJNtiXysmYAU4GgRKQvBeHtYX5V41rA/9nSYeH3b1hVCc2es+48mxvE8IRfM8jVGPEQyFVkKb5uQAhxJjAdlNIRQ9TTWgZUniEZKJ5hyQ7RHETrBXBDhzL7Uq/3pXY8l7UZmwkWLcLaub4lErZImacqimeE9vnTxhcgVd09ApAUcLXRL5b6GnyDSpQIliUGwMLRI3mFForlngiIFOBaIpPg6yxPfDyxptH7D4ffVS5/q+5DurA4jXRDRxp4RuRKSwtK4r2RqXZ/I6OLnyGosxhRuQdSz24rWyYt5iEgLONqE01IJhPHkkSQeRhdEuLPREj3rMgJIAY42xkppkUzf1PcVz1lxyY6//1eP+Q6nT9bofpDHTMhIF0QyEczXXLSueb1W6aKIP/SYX70Or9kIC9zrlpaviUgBjhV6jolsOyFHVmMKqTyh4hPd36vXWwj3ZK7qaYwhl9avKUgXRCyg124IZ63gQNRXNXV9FC6DhaJpjKd0UcQ2i2aGv928nsIu5w1MR1rAsYS/JeoIY6+ythblliddbKNfuMOJ3lZIthcyHWkBRxP/SRRoFN36qvBaNkbx1V0fugWup5fWV8nJllglknV3jXdEElORAhyL9BwTnjrBgQgm8sYJHemGSG7sGY2theQF2VSkAEeTlspUQmNTxXCii7yxZq7e4SCS1bMk7SNSF2gIby3hJEcKcKyhT8jpCRORwujr1a0cafnGDtH+LVSPjJYJA1KAY4H2to2RPrnkxXiBhsgeC8bebxJTkAIca0Tr1t8YFVG0TtYMjhX8J2r9XQGRvBDLOyPTkWFosYQe4B6shxk0poKGE71WREuhRjJTLnoEimAJJ8LSWOhHYirSAo5V9OB3f2s4XL5hf0tK3mqGjlmW4v4Vjf/7/06RsID1fTyQ3fSYlJZwyEgBjkWMB/R91sAWTzhPPGOXW38eyNb+ymI+iYnRpxyJKJwkRwpwrKNXtvLPSAvHiaEnX+j4uxikyLaNYF0qWvv+goUjRtLP67+vQJ205UXXNKQPONbxL3ziLA+fVaJnwTnLNUvXKPp6xTR9/8LS1FKWldRCRxY4TzqkBRwvRLpwuzETTkcG47eNYAk2wfBPK9bdPHqn4UUzI/e76y6IlibdpOVrGlKA4wVj4XaIXIZcsGXGiRl9LPdZm1Z0k7eqbcPf6tUvfnrMb7irnRmRoYgRRQpwvGGMTti/IvIlLI04y5tWbFM9kRWLWKcl8TJenPzvbowiGKk0YD28UT+WZBRMRJACHE8YT1rdEo6Fmg3G2XI9RllO3LQf/6w2/TuNZKhZoEk3SdiQAhyvxIqF4l9XWBcNOZkUmEAREvp3ZUywidRdjb/oyzuYiCIFOB7wP2mNfldHVnRrQwTar7A0XiBaC8lKRMs42GdaNLPRbaRjrP2hu3Mi6VLSaz9D42+WSL9FjCPD0OIRo5XSc4wWKxzO7hk6jixtdlwXfZ1g9QmSMRTNaNEGQq+pG4hwhhi2hLNcO6YKlzXWf5ZhhRFBWsDxgL+1aGw/7h8dEU70ljSgiW5LWXm6COlio9e38P8siZTWaqxmZyxkD00tXaPPPtq3/MY289GeS0hCpADHE7qoGf2s+kke6ROoNZdHR33U8SrEgUqJtqV8YzTLiuria4w3hsS+SMYYUoDjgZZu//STPJYmvYSluRj5V3hrrRtIqERKLNoyXuNn0wvrRKOer76/YJEqkogjBTie8I8X9beuYqVQe2vlMv0zvSA81lYkkgn0C98dZY0Cq/8G+u/zQHbwLtSR/r38xdf4Pft/T4k8URojSAGOZfxFSbci9Qk3s61Gswg0kaRb6sEK/Jht+erfWbhE2D992F98dSKdydYazvKW601LIopQVbXNK0+YMEFds2ZNGIcjaUIwAfYnXidP9AtJIEs4mGC25XV/94fZyQXGcDJ9P7Fy99EW8mY0b7zqP0kqMRUhxFpVVSf4L5cWcCzT2i1ga1ZjPIlCW9DFtaWJLf222ijC7ZkQDOQe8ccY4RFvFz//mF9JVJECHM8EC0/TRddffGNNkANZpW2xbAuXNXZnCLS+UYSN67TXlxlsgtAYhaKH4vWbHl9iLP27MYEU4HigPSdHS8V5olm4pzXuazgU73U3fy2QW6E1v6pxgqkt+E+U6c/1gvgQuCi+HlWgi3EsXeB0jC4HKbQxhRTgRMD/pNLFLBbFADShMgqbP4GsMv/bfWO2XWt1a4NFWPivZ8SYNGGcYAtWmSxWLmz+Vc2g5RBFKchRRaYiJxJ6yqjqaczx1ztXCEvszMarnsbuDwuF9tDHvFA0v5XXQ6T8U6DbQmupwfo6PccEv2AVrQvu1glXk9SO4p/qrE9A+idbSGICaQEnOrGUZmosW9maaOl+Xmg6IWYUmPZab/7irX8nrdXRiCWBbY36quA+cEnMIQU4kWhpYiVSnTRaor37NhZ896+r4D+LbxQcY60MXWRbciHo2woUxxtvqJ6mtaKNvnCZUhxzSAFOdPxn7uMNZ7nmljDSVjeE0fVgdCHoy/XJKf914h39wiUFNuaRApyIJPqJp4uosWeaLp5tqTRWX9U0XToRLF9/9M7WoYThScKOnIRLdPSTzejnNFqQkagjHA70eGBneXDxDLZcF189tC3RxNeIsWKeJOaQFnAy4F+PIJFb0DiyGi3iluJyAxXHidU43o6gRz8YkZZvzCEFOBkwnoi6z1O/DTe2wwlWsSueMPq62yO+La0fD/j/fvVVTZNXpPshJpECnMj4z34H67prJJGswGAkwufTEy70C2mwON9YCD+UBEUKsETDeNsuiX2CJYboyIm3uEAKcCLSUpcJ422p0do1FupOxKiARMSY1CKFNi6RURDJxPylmsDq6cn+7ghnedO0W0ls05YIlkCdLiQxg7SAE4m2ZDzNX9q8g3Iy+H1jjfZ+5/r6ejq3sGh+X/23lCIbl0gLOBm5o0x75M3QHve6tb/+oUvSHxw+2nvB011EuttB9WgX2EQLI0wypAWcSIQ68aJnTcVC3QhJUwIVVNLdRdIHHLdIAU5m/F0THUW6MCKDHnqmFxyKlSp3kg4jm3JKgqNbVTIqIvoYL3LGZqbS8o0LgjXlbJcACyGKgUIzByaRSCRJQJ6qqt38F7ZLgCUSiURiHjIKQiKRSKKEFGCJRCKJElKAJRKJJEpIAZZIJJIoIQVYIpFIooQUYIlEIokSUoAlEokkSkgBlkgkkighBVgikUiixP8HhOW1tdCcXZAAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "### Dataset visualisation\n", - "\n", - "reg = LogisticRegression().fit(X, l)\n", - "l_fitted = reg.predict(X)\n", - "l_fitted\n", - "markers = np.array(['.', '+'], dtype=str)\n", - "labels = ['nonsocial', 'social']\n", - "\n", - "# plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_fitted], marker=markers[y])\n", - "for i, c in enumerate(np.unique(l)):\n", - " plt.scatter(X[:,0][l==c],X[:,1][l==c],color=colors[l_fitted][l==c], marker=markers[i], label=labels[i])\n", - " \n", - "plt.plot()\n", - "plt.xlim(-1.5, 2.5)\n", - "plt.ylim(-1.5, 1.5)\n", - "plt.xticks(())\n", - "plt.yticks(())\n", - "plt.title('noisymoons')\n", - "plt.legend()\n", - "plt.savefig('noisymoons.pdf')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "55d8284a", - "metadata": {}, - "outputs": [], - "source": [ - "# Treatments and other quantities\n", - "\n", - "T = rng.choice([0,1], size=(n_samples,1))\n", - "np.mean(X, axis=0)\n", - "#mean_X = np.array([0.5, 0.25])\n", - "mean_X = np.array([0., 0.])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "1c590f2f", - "metadata": {}, - "outputs": [], - "source": [ - "# Coefficients\n", - "\n", - "reg = LinearRegression().fit(X, l)\n", - "reg.score(X, l), reg.coef_, reg.intercept_\n", - "beta_0 = reg.intercept_\n", - "beta_X = reg.coef_\n", - "\n", - "beta_T = np.array([1])\n", - "beta_TX = np.array([0,0])\n", - "omega_T = 0.9\n", - "\n", - "M_ = beta_0 + X.dot(beta_X) + omega_T*T.dot(beta_T) + (T*X).dot(beta_TX) + rng.normal(0, 0.1, size=T.shape[0])\n", - "M = np.expand_dims(M_, axis=-1)\n", - "\n", - "gamma_0 = 0\n", - "gamma_X = np.array([0,0]) \n", - "gamma_T = np.array([0.2])\n", - "gamma_M = np.array([1])\n", - "gamma_MT = np.array([0])\n", - "omega_M = 0.9\n", - "\n", - "Y = gamma_0 + X.dot(gamma_X) + T.dot(gamma_T) + omega_M*M.dot(gamma_M) + (T*M).dot(gamma_MT) + rng.normal(0, 0.1, size=T.shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "id": "a24af6c3", - "metadata": {}, - "outputs": [], - "source": [ - "((T*X).dot(beta_TX)).shape\n", - "linear_X = beta_0 + X.dot(beta_X)\n", - "linear_T = omega_T*T.dot(beta_T)\n", - "linear_TX = (T*X).dot(beta_TX)\n", - "noise = rng.normal(0, 0.1, size=T.shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "811a6446", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d5c31a25", - "metadata": {}, - "outputs": [], - "source": [ - "### Causal quantities\n", - "\n", - "mean_M_t1 = beta_0 + mean_X.dot(beta_X) + mean_X.dot(beta_TX) + omega_T *beta_T\n", - "mean_M_t0 = beta_0 + mean_X.dot(beta_X)\n", - "\n", - "theta_1 = gamma_T + gamma_MT.T.dot(mean_M_t1) # to do mean(m1) pour avoir un vecteur de taille dim_m\n", - "theta_0 = gamma_T + gamma_MT.T.dot(mean_M_t0)\n", - "#delta_1 = (gamma_T * t1 + m1.dot(gamma_m) + m1.dot(gamma_t_m) * t1 - gamma_t * t1 + m0.dot(gamma_m) + m0.dot(gamma_t_m) * t1).mean()\n", - "#delta_0 = (gamma_T * t0 + m1.dot(gamma_m) + m1.dot(gamma_t_m) * t0 - gamma_t * t0 + m0.dot(gamma_m) + m0.dot(gamma_t_m) * t0).mean()\n", - "delta_1 = (mean_X.dot(beta_TX) + omega_T * beta_T) * (omega_M * gamma_M + gamma_MT)\n", - "delta_0 = (mean_X.dot(beta_TX) + omega_T * beta_T) * (omega_M * gamma_M)\n", - "\n", - "tau = gamma_T + omega_M * gamma_M * omega_T * beta_T\n", - "tau" - ] - }, - { - "cell_type": "markdown", - "id": "9bd617ea", - "metadata": {}, - "source": [ - "## Causal effect estimation" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "fa63c8d4", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "CV_FOLDS = 5\n", - "ALPHAS = np.logspace(-5, 5, 8)" - ] - }, - { - "cell_type": "markdown", - "id": "1a5768c8", - "metadata": {}, - "source": [ - "### Importance weighting" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "cfdb2b86", - "metadata": {}, - "outputs": [], - "source": [ - "def get_classifier(regularization=False, forest=False, calibration=True, calib_method='sigmoid'):\n", - " if regularization:\n", - " cs = ALPHAS\n", - " else:\n", - " cs = [np.inf]\n", - " \n", - " if not forest:\n", - " x_clf = LogisticRegressionCV(Cs=cs, cv=CV_FOLDS)\n", - " xm_clf = LogisticRegressionCV(Cs=cs, cv=CV_FOLDS)\n", - " else:\n", - " x_clf = RandomForestClassifier(n_estimators=100,\n", - " min_samples_leaf=10)\n", - " xm_clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=10)\n", - " if calibration:\n", - " x_clf = CalibratedClassifierCV(x_clf,\n", - " method=calib_method)\n", - " xm_clf = CalibratedClassifierCV(xm_clf, method=calib_method)\n", - " \n", - " return x_clf, xm_clf\n", - " \n", - "def get_train_test_lists(crossfit, n): \n", - " if crossfit < 2:\n", - " train_test_list = [[np.arange(n), np.arange(n)]]\n", - " else:\n", - " kf = KFold(n_splits=crossfit)\n", - " train_test_list = list()\n", - " for train_index, test_index in kf.split(x):\n", - " train_test_list.append([train_index, test_index])\n", - " return train_test_list\n", - "\n", - "def estimate_probabilities(t, m, x, crossfit, classifier_x, classifier_xm):\n", - " \n", - " n = len(t)\n", - " train_test_list = [[np.arange(n), np.arange(n)]]\n", - " \n", - " p_x, p_xm = [np.zeros(n) for h in range(2)]\n", - " # compute propensity scores\n", - " if len(x.shape) == 1:\n", - " x = x.reshape(-1, 1)\n", - " if len(m.shape) == 1:\n", - " m = m.reshape(-1, 1)\n", - " \n", - " train_test_list = get_train_test_lists(crossfit, n)\n", - " \n", - " for train_index, test_index in train_test_list:\n", - " x_clf = classifier_x.fit(x[train_index, :], t[train_index])\n", - " xm_clf = classifier_xm.fit(np.hstack((x, m))[train_index, :], t[train_index])\n", - " p_x[test_index] = x_clf.predict_proba(x[test_index, :])[:, 1]\n", - " p_xm[test_index] = xm_clf.predict_proba(\n", - " np.hstack((x, m))[test_index, :])[:, 1]\n", - " \n", - " return p_x, p_xm\n", - "\n", - "def SNIPW(y, t, m, x, trim, p_x, p_xm):\n", - " \"\"\"\n", - " IPW estimator presented in\n", - " HUBER, Martin. Identifying causal mechanisms (primarily) based on inverse\n", - " probability weighting. Journal of Applied Econometrics, 2014,\n", - " vol. 29, no 6, p. 920-943.\n", - "\n", - " results has 6 values\n", - " - total effect\n", - " - direct effect treated (\\theta(1))\n", - " - direct effect non treated (\\theta(0))\n", - " - indirect effect treated (\\delta(1))\n", - " - indirect effect untreated (\\delta(0))\n", - " - number of used observations (non trimmed)\n", - "\n", - " y array-like, shape (n_samples)\n", - " outcome value for each unit, continuous\n", - "\n", - " t array-like, shape (n_samples)\n", - " treatment value for each unit, binary\n", - "\n", - " m array-like, shape (n_samples, n_features_mediator)\n", - " mediator value for each unit, can be continuous or binary, and\n", - " multi-dimensional\n", - "\n", - " x array-like, shape (n_samples, n_features_covariates)\n", - " covariates (potential confounders) values\n", - "\n", - "\n", - " trim float\n", - " Trimming rule for discarding observations with extreme propensity\n", - " scores. In the absence of post-treatment confounders (w=NULL),\n", - " observations with Pr(D=1|M,X)(1-trim) are\n", - " dropped. In the presence of post-treatment confounders\n", - " (w is defined), observations with Pr(D=1|M,W,X)(1-trim) are dropped.\n", - "\n", - " logit boolean\n", - " whether logit or pobit regression is used for propensity score\n", - " legacy from the R package, here only logit is implemented\n", - "\n", - " regularization boolean, default True\n", - " whether to use regularized models (logistic or\n", - " linear regression). If True, cross-validation is used\n", - " to chose among 8 potential log-spaced values between\n", - " 1e-5 and 1e5\n", - "\n", - " forest boolean, default False\n", - " whether to use a random forest model to estimate the propensity\n", - " scores instead of logistic regression\n", - "\n", - " crossfit integer, default 0\n", - " number of folds for cross-fitting\n", - "\n", - " clip float\n", - " limit to clip for numerical stability (min=clip, max=1-clip)\n", - " \"\"\"\n", - "\n", - " # trimming. Following causal weight code, not sure I understand\n", - " # why we trim only on p_xm and not on p_x\n", - " ind = ((p_xm > trim) & (p_xm < (1 - trim)))\n", - " y, t, p_x, p_xm = y[ind], t[ind], p_x[ind], p_xm[ind]\n", - "\n", - " # note on the names, ytmt' = Y(t, M(t')), the treatment needs to be\n", - " # binary but not the mediator\n", - " p_x = np.clip(p_x, clip, 1 - clip)\n", - " p_xm = np.clip(p_xm, clip, 1 - clip)\n", - "\n", - " y1m1 = np.sum(y * t / p_x) / np.sum(t / p_x)\n", - " y1m0 = np.sum(y * t * (1 - p_xm) / (p_xm * (1 - p_x))) /\\\n", - " np.sum(t * (1 - p_xm) / (p_xm * (1 - p_x)))\n", - " y0m0 = np.sum(y * (1 - t) / (1 - p_x)) /\\\n", - " np.sum((1 - t) / (1 - p_x))\n", - " y0m1 = np.sum(y * (1 - t) * p_xm / ((1 - p_xm) * p_x)) /\\\n", - " np.sum((1 - t) * p_xm / ((1 - p_xm) * p_x))\n", - "\n", - " return(y1m1 - y0m0,\n", - " y1m1 - y0m1,\n", - " y1m0 - y0m0,\n", - " y1m1 - y1m0,\n", - " y0m1 - y0m0,\n", - " np.sum(ind))\n", - "\n", - "def IPW(y, t, m, x, trim, p_x, p_xm):\n", - " \"\"\"\n", - " IPW estimator presented in\n", - " HUBER, Martin. Identifying causal mechanisms (primarily) based on inverse\n", - " probability weighting. Journal of Applied Econometrics, 2014,\n", - " vol. 29, no 6, p. 920-943.\n", - "\n", - " results has 6 values\n", - " - total effect\n", - " - direct effect treated (\\theta(1))\n", - " - direct effect non treated (\\theta(0))\n", - " - indirect effect treated (\\delta(1))\n", - " - indirect effect untreated (\\delta(0))\n", - " - number of used observations (non trimmed)\n", - "\n", - " y array-like, shape (n_samples)\n", - " outcome value for each unit, continuous\n", - "\n", - " t array-like, shape (n_samples)\n", - " treatment value for each unit, binary\n", - "\n", - " m array-like, shape (n_samples, n_features_mediator)\n", - " mediator value for each unit, can be continuous or binary, and\n", - " multi-dimensional\n", - "\n", - " x array-like, shape (n_samples, n_features_covariates)\n", - " covariates (potential confounders) values\n", - "\n", - "\n", - " trim float\n", - " Trimming rule for discarding observations with extreme propensity\n", - " scores. In the absence of post-treatment confounders (w=NULL),\n", - " observations with Pr(D=1|M,X)(1-trim) are\n", - " dropped. In the presence of post-treatment confounders\n", - " (w is defined), observations with Pr(D=1|M,W,X)(1-trim) are dropped.\n", - "\n", - " logit boolean\n", - " whether logit or pobit regression is used for propensity score\n", - " legacy from the R package, here only logit is implemented\n", - "\n", - " regularization boolean, default True\n", - " whether to use regularized models (logistic or\n", - " linear regression). If True, cross-validation is used\n", - " to chose among 8 potential log-spaced values between\n", - " 1e-5 and 1e5\n", - "\n", - " forest boolean, default False\n", - " whether to use a random forest model to estimate the propensity\n", - " scores instead of logistic regression\n", - "\n", - " crossfit integer, default 0\n", - " number of folds for cross-fitting\n", - "\n", - " clip float\n", - " limit to clip for numerical stability (min=clip, max=1-clip)\n", - " \"\"\"\n", - "\n", - " # trimming. Following causal weight code, not sure I understand\n", - " # why we trim only on p_xm and not on p_x\n", - " ind = ((p_xm > trim) & (p_xm < (1 - trim)))\n", - " y, t, p_x, p_xm = y[ind], t[ind], p_x[ind], p_xm[ind]\n", - "\n", - " # note on the names, ytmt' = Y(t, M(t')), the treatment needs to be\n", - " # binary but not the mediator\n", - " p_x = np.clip(p_x, clip, 1 - clip)\n", - " p_xm = np.clip(p_xm, clip, 1 - clip)\n", - "\n", - " y1m1 = np.mean(y * t / p_x) \n", - " y1m0 = np.mean(y * t * (1 - p_xm) / (p_xm * (1 - p_x))) \n", - " y0m0 = np.mean(y * (1 - t) / (1 - p_x)) \n", - " y0m1 = np.mean(y * (1 - t) * p_xm / ((1 - p_xm) * p_x)) \n", - "\n", - " return(y1m1 - y0m0,\n", - " y1m1 - y0m1,\n", - " y1m0 - y0m0,\n", - " y1m1 - y1m0,\n", - " y0m1 - y0m0,\n", - " np.sum(ind))\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "cf795077", - "metadata": {}, - "outputs": [], - "source": [ - "y = Y\n", - "t = T\n", - "m = M\n", - "x = X\n", - "trim=0\n", - "logit=True\n", - "regularization=False\n", - "forest=False\n", - "crossfit=0\n", - "clip=0.0\n", - "calibration=False\n", - "classifier_x, classifier_xm = get_classifier(regularization, forest, calibration)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "60063b02", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n", - "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - } - ], - "source": [ - "p_x, p_xm = estimate_probabilities(t, m, x, crossfit, classifier_x, classifier_xm)\n", - "effects_IPW = IPW(y, t, m, x, trim, p_x, p_xm)\n", - "effects_SNIPW = SNIPW(y, t, m, x, trim, p_x, p_xm)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "d6e0cf37", - "metadata": {}, - "outputs": [], - "source": [ - "tau_hat, theta_1_hat, theta_0_hat, delta_1_hat, delta_0_hat, n_non_trimmed = effects_IPW" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "69e45369", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "IPW\n", - "Direct effects\n", - "True theta1:[0.2], estimated theta1: -478403411903829.6\n", - "True theta0:[0.2], estimated theta0: -5.960267411051258e+25\n", - "\\\n", - "Indirect effects\n", - "True delta1:[0.81], estimated delta1: 5.960267411051258e+25\n", - "True delta0:[0.81], estimated delta0: 478403411903829.1\n", - "\\\n", - "Total effect\n", - "True tau:[1.01], estimated tau: -0.489\n" - ] - } - ], - "source": [ - "print(\"IPW\")\n", - "print(\"Direct effects\")\n", - "print(\"True theta1:{}, estimated theta1: {}\".format(theta_1, round(theta_1_hat,3)))\n", - "print(\"True theta0:{}, estimated theta0: {}\".format(theta_0, round(theta_0_hat,3)))\n", - "print(\"\\\\\")\n", - "print(\"Indirect effects\")\n", - "print(\"True delta1:{}, estimated delta1: {}\".format(delta_1, round(delta_1_hat,3)))\n", - "print(\"True delta0:{}, estimated delta0: {}\".format(delta_0, round(delta_0_hat,3)))\n", - "print(\"\\\\\")\n", - "print(\"Total effect\")\n", - "print(\"True tau:{}, estimated tau: {}\".format(tau, round(tau_hat,3)))" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "c5ad6863", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SNIPW\n", - "Direct effects\n", - "True theta1:[0.2], estimated theta1: -0.694\n", - "True theta0:[0.2], estimated theta0: -0.903\n", - "\\\n", - "Indirect effects\n", - "True delta1:[0.81], estimated delta1: 0.893\n", - "True delta0:[0.81], estimated delta0: 0.684\n", - "\\\n", - "Total effect\n", - "True tau:[1.01], estimated tau: -0.01\n" - ] - } - ], - "source": [ - "tau_hat, theta_1_hat, theta_0_hat, delta_1_hat, delta_0_hat, n_non_trimmed = effects_SNIPW\n", - "print(\"SNIPW\")\n", - "print(\"Direct effects\")\n", - "print(\"True theta1:{}, estimated theta1: {}\".format(theta_1, round(theta_1_hat,3)))\n", - "print(\"True theta0:{}, estimated theta0: {}\".format(theta_0, round(theta_0_hat,3)))\n", - "print(\"\\\\\")\n", - "print(\"Indirect effects\")\n", - "print(\"True delta1:{}, estimated delta1: {}\".format(delta_1, round(delta_1_hat,3)))\n", - "print(\"True delta0:{}, estimated delta0: {}\".format(delta_0, round(delta_0_hat,3)))\n", - "print(\"\\\\\")\n", - "print(\"Total effect\")\n", - "print(\"True tau:{}, estimated tau: {}\".format(tau, round(tau_hat,3)))" - ] - }, - { - "cell_type": "markdown", - "id": "e5147842", - "metadata": {}, - "source": [ - "### Ordinary least squares" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1eaa1b0b", - "metadata": {}, - "outputs": [], - "source": [ - "def ols_mediation(y, t, m, x, interaction=False, regularization=True):\n", - " \"\"\"\n", - " found an R implementation https://cran.r-project.org/package=regmedint\n", - "\n", - " implements very simple model of mediation\n", - " Y ~ X + T + M\n", - " M ~ X + T\n", - " estimation method is product of coefficients\n", - "\n", - " y array-like, shape (n_samples)\n", - " outcome value for each unit, continuous\n", - "\n", - " t array-like, shape (n_samples)\n", - " treatment value for each unit, binary\n", - "\n", - " m array-like, shape (n_samples)\n", - " mediator value for each unit, can be continuous or binary, and\n", - " is necessary in 1D\n", - "\n", - " x array-like, shape (n_samples, n_features_covariates)\n", - " covariates (potential confounders) values\n", - "\n", - " interaction boolean, default=False\n", - " whether to include interaction terms in the model\n", - " not implemented here, just for compatibility of signature\n", - " function\n", - "\n", - " regularization boolean, default True\n", - " whether to use regularized models (logistic or\n", - " linear regression). If True, cross-validation is used\n", - " to chose among 8 potential log-spaced values between\n", - " 1e-5 and 1e5\n", - "\n", - " \"\"\"\n", - " if regularization:\n", - " alphas = ALPHAS\n", - " else:\n", - " alphas = [0.0]\n", - " if len(x.shape) == 1:\n", - " x = x.reshape(-1, 1)\n", - " if len(m.shape) == 1:\n", - " m = m.reshape(-1, 1)\n", - " if len(t.shape) == 1:\n", - " t = t.reshape(-1, 1)\n", - " coef_t_m = np.zeros(m.shape[1])\n", - " for i in range(m.shape[1]):\n", - " m_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS)\\\n", - " .fit(np.hstack((x, t)), m[:, i])\n", - " coef_t_m[i] = m_reg.coef_[-1]\n", - " y_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS)\\\n", - " .fit(np.hstack((x, t, m)), y.ravel())\n", - "\n", - " # return total, direct and indirect effect\n", - " direct_effect = y_reg.coef_[x.shape[1]]\n", - " indirect_effect = sum(y_reg.coef_[x.shape[1] + 1:] * coef_t_m)\n", - " return [direct_effect + indirect_effect,\n", - " direct_effect,\n", - " direct_effect,\n", - " indirect_effect,\n", - " indirect_effect,\n", - " None]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "id": "d8c15b96", - "metadata": {}, - "outputs": [], - "source": [ - "def get_regressions(regularization=False, interaction=False, forest=False, calibration=True, calib_method='sigmoid'):\n", - " if regularization:\n", - " cs = ALPHAS\n", - " else:\n", - " cs = [np.inf]\n", - " \n", - " # mu_tm, f_mtx, and p_x model fitting\n", - " if not forest:\n", - " y_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS)\n", - " pre_m_prob = LogisticRegressionCV(Cs=cs, cv=CV_FOLDS).fit(\n", - " get_interactions(interaction, t, x)[train_index, :], m[train_index]\n", - " )\n", - " pre_p_x_clf = LogisticRegressionCV(Cs=cs, cv=CV_FOLDS).fit(\n", - " x[train_index, :], t[train_index]\n", - " )\n", - " else:\n", - " y_reg = RandomForestRegressor(n_estimators=100, min_samples_leaf=10)\n", - " pre_m_prob = RandomForestClassifier(\n", - " n_estimators=100, min_samples_leaf=10\n", - " )\n", - " pre_p_x_clf = RandomForestClassifier(\n", - " n_estimators=100, min_samples_leaf=10\n", - " )\n", - " if calibration:\n", - " m_prob = CalibratedClassifierCV(pre_m_prob, method=calib_method)\n", - " p_x_clf = CalibratedClassifierCV(pre_p_x_clf, method=calib_method)\n", - " else:\n", - " m_prob = pre_m_prob\n", - " p_x_clf = pre_p_x_clf\n", - " \n", - " return y_reg, m_prob, p_x_clf\n", - " \n", - "def get_train_test_lists(crossfit, n): \n", - " if crossfit < 2:\n", - " train_test_list = [[np.arange(n), np.arange(n)]]\n", - " else:\n", - " kf = KFold(n_splits=crossfit)\n", - " train_test_list = list(kf.split(x))\n", - " \n", - " return train_test_list\n", - "\n", - "def estimate_probabilities(t, m, x, crossfit, classifier_x, classifier_xm):\n", - " \n", - " n = len(t)\n", - " train_test_list = [[np.arange(n), np.arange(n)]]\n", - " \n", - " p_x, p_xm = [np.zeros(n) for h in range(2)]\n", - " # compute propensity scores\n", - " if len(x.shape) == 1:\n", - " x = x.reshape(-1, 1)\n", - " if len(m.shape) == 1:\n", - " m = m.reshape(-1, 1)\n", - " \n", - " train_test_list = get_train_test_lists(crossfit, n)\n", - " \n", - " for train_index, test_index in train_test_list:\n", - " x_clf = classifier_x.fit(x[train_index, :], t[train_index])\n", - " xm_clf = classifier_xm.fit(np.hstack((x, m))[train_index, :], t[train_index])\n", - " p_x[test_index] = x_clf.predict_proba(x[test_index, :])[:, 1]\n", - " p_xm[test_index] = xm_clf.predict_proba(\n", - " np.hstack((x, m))[test_index, :])[:, 1]\n", - " \n", - " return p_x, p_xm\n", - "\n", - "def get_interactions(interaction, *args):\n", - " \"\"\"\n", - " this function provides interaction terms between different groups of\n", - " variables (confounders, treatment, mediators)\n", - " Inputs\n", - " --------\n", - " interaction boolean\n", - " whether to compute interaction terms\n", - "\n", - " *args flexible, one or several arrays\n", - " blocks of variables between which interactions should be\n", - " computed\n", - " Returns\n", - " --------\n", - " Examples\n", - " --------\n", - " >>> x = np.arange(6).reshape(3, 2)\n", - " >>> t = np.ones((3, 1))\n", - " >>> m = 2 * np.ones((3, 1))\n", - " >>> get_interactions(False, x, t, m)\n", - " array([[0., 1., 1., 2.],\n", - " [2., 3., 1., 2.],\n", - " [4., 5., 1., 2.]])\n", - " >>> get_interactions(True, x, t, m)\n", - " array([[ 0., 1., 1., 2., 0., 1., 0., 2., 2.],\n", - " [ 2., 3., 1., 2., 2., 3., 4., 6., 2.],\n", - " [ 4., 5., 1., 2., 4., 5., 8., 10., 2.]])\n", - " \"\"\"\n", - " variables = list(args)\n", - " for index, var in enumerate(variables):\n", - " if len(var.shape) == 1:\n", - " variables[index] = var.reshape(-1,1)\n", - " pre_inter_variables = np.hstack(variables)\n", - " if not interaction:\n", - " return pre_inter_variables\n", - " new_cols = list()\n", - " for i, var in enumerate(variables[:]):\n", - " for j, var2 in enumerate(variables[i+1:]):\n", - " for ii in range(var.shape[1]):\n", - " for jj in range(var2.shape[1]):\n", - " new_cols.append((var[:, ii] * var2[:, jj]).reshape(-1, 1))\n", - " new_vars = np.hstack(new_cols)\n", - " result = np.hstack((pre_inter_variables, new_vars))\n", - " return result\n", - "\n", - "def multiply_robust_efficient(\n", - " y,\n", - " t,\n", - " m,\n", - " x,\n", - " interaction=False,\n", - " forest=False,\n", - " crossfit=0,\n", - " trim=0.01,\n", - " regularization=True,\n", - " calibration=True,\n", - " calib_method=\"sigmoid\",\n", - "):\n", - " \"\"\"\n", - " Presented in Eric J. Tchetgen Tchetgen. Ilya Shpitser.\n", - " \"Semiparametric theory for causal mediation analysis: Efficiency bounds,\n", - " multiple robustness and sensitivity analysis.\"\n", - " Ann. Statist. 40 (3) 1816 - 1845, June 2012.\n", - " https://doi.org/10.1214/12-AOS990\n", - "\n", - " Parameters\n", - " ----------\n", - " y : array-like, shape (n_samples)\n", - " Outcome value for each unit, continuous\n", - "\n", - " t : array-like, shape (n_samples)\n", - " Treatment value for each unit, binary\n", - "\n", - " m : array-like, shape (n_samples)\n", - " Mediator value for each unit, binary and unidimensional\n", - "\n", - " x : array-like, shape (n_samples, n_features_covariates)\n", - " Covariates value for each unit, continuous\n", - "\n", - " interaction : boolean, default=False\n", - " Whether to include interaction terms in the model\n", - " interactions are terms XT, TM, MX\n", - "\n", - " forest : boolean, default=False\n", - " Whether to use a random forest model to estimate the propensity\n", - " scores instead of logistic regression, and outcome model instead\n", - " of linear regression\n", - "\n", - " crossfit : integer, default=0\n", - " Number of folds for cross-fitting. If crossfit<2, no cross-fitting is applied\n", - "\n", - " trim : float, default=0.01\n", - " Limit to trim p_x and f_mtx for numerical stability (min=trim, max=1-trim)\n", - "\n", - " regularization : boolean, default=True\n", - " Whether to use regularized models (logistic or linear regression).\n", - " If True, cross-validation is used to chose among 8 potential\n", - " log-spaced values between 1e-5 and 1e5\n", - "\n", - " calibration : boolean, default=True\n", - " Whether to add a calibration step so that the classifier used to estimate\n", - " the treatment propensity score and the density of the (binary) mediator.\n", - " Calibration ensures the output of the [predict_proba](https://scikit-learn.org/stable/glossary.html#term-predict_proba)\n", - " method can be directly interpreted as a confidence level.\n", - "\n", - " calib_method : str, default=\"sigmoid\"\n", - " Which calibration method to use.\n", - " Implemented calibration methods are \"sigmoid\" and \"isotonic\".\n", - "\n", - "\n", - " Returns\n", - " -------\n", - " total : float\n", - " Average total effect.\n", - " direct1 : float\n", - " Direct effect on the exposed.\n", - " direct0 : float\n", - " Direct effect on the unexposed,\n", - " indirect1 : float\n", - " Indirect effect on the exposed.\n", - " indirect0 : float\n", - " Indirect effect on the unexposed.\n", - " n_discarded : int\n", - " Number of discarded samples due to trimming.\n", - "\n", - "\n", - " Raises\n", - " ------\n", - " ValueError\n", - " - If t or y are multidimensional.\n", - " - If x, t, m, or y don't have the same length.\n", - " - If m is not binary.\n", - " \"\"\"\n", - " # Format checking\n", - " if len(y) != len(y.ravel()):\n", - " raise ValueError(\"Multidimensional y is not supported\")\n", - " if len(t) != len(t.ravel()):\n", - " raise ValueError(\"Multidimensional t is not supported\")\n", - " if len(m) != len(m.ravel()):\n", - " raise ValueError(\"Multidimensional m is not supported\")\n", - "\n", - " n = len(y)\n", - " if len(x.shape) == 1:\n", - " x.reshape(n, 1)\n", - " if len(m.shape) == 1:\n", - " m.reshape(n, 1)\n", - "\n", - " dim_m = m.shape[1]\n", - " if n * dim_m != sum(m.ravel() == 1) + sum(m.ravel() == 0):\n", - " raise ValueError(\"m is not binary\")\n", - "\n", - " y = y.ravel()\n", - " t = t.ravel()\n", - " m = m.ravel()\n", - " if n != len(x) or n != len(m) or n != len(t):\n", - " raise ValueError(\"Inputs don't have the same number of observations\")\n", - "\n", - " # Initialisation\n", - " (\n", - " p_x, # P(T=1|X)\n", - " f_00x, # f(M=0|T=0,X)\n", - " f_01x, # f(M=0|T=1,X)\n", - " f_10x, # f(M=1|T=0,X)\n", - " f_11x, # f(M=1|T=1,X)\n", - " f_m0x, # f(M|T=0,X)\n", - " f_m1x, # f(M|T=1,X)\n", - " mu_t1, # E[Y|T=1,M,X]\n", - " mu_t0, # E[Y|T=0,M,X]\n", - " mu_t1_m1, # E[Y|T=1,M=1,X]\n", - " mu_t1_m0, # E[Y|T=1,M=0,X]\n", - " mu_t0_m1, # E[Y|T=0,M=1,X]\n", - " mu_t0_m0, # E[Y|T=0,M=0,X]\n", - " E_mu_t0_t0, # E[E[Y|T=0,M,X]|T=0,X]\n", - " E_mu_t0_t1, # E[E[Y|T=0,M,X]|T=1,X]\n", - " E_mu_t1_t0, # E[E[Y|T=1,M,X]|T=0,X]\n", - " E_mu_t1_t1, # E[E[Y|T=1,M,X]|T=1,X]\n", - " ) = [np.zeros(n) for _ in range(17)]\n", - " t0, m0 = np.zeros((n, 1)), np.zeros((n, 1))\n", - " t1, m1 = np.ones((n, 1)), np.ones((n, 1))\n", - " n_discarded = 0\n", - "\n", - " if regularization:\n", - " alphas, cs = ALPHAS, ALPHAS\n", - " else:\n", - " alphas, cs = [0.0], [np.inf]\n", - "\n", - " train_test_list = get_train_test_lists(crossfit, n)\n", - "\n", - " # Cross-fitting loop\n", - " for train_index, test_index in train_test_list:\n", - " # Index declaration\n", - " test_ind = np.arange(len(test_index))\n", - " ind_t0 = t[test_index] == 0\n", - "\n", - " y_reg, m_prob, p_x_clf = get_regressions(regularization, interaction, forest, calibration)\n", - " y_reg.fit(\n", - " get_interactions(interaction, x, t, m)[train_index, :], y[train_index]\n", - " )\n", - " m_prob.fit(\n", - " get_interactions(interaction, t, x)[train_index, :], m[train_index]\n", - " )\n", - " p_x_clf.fit(\n", - " x[train_index, :], t[train_index]\n", - " )\n", - "\n", - " # predict P(T=1|X)\n", - " p_x[test_index] = p_x_clf.predict_proba(x[test_index, :])[:, 1]\n", - "\n", - " # predict f(M=m|T=t,X)\n", - " res = m_prob.predict_proba(get_interactions(interaction, t0, x)[test_index, :])\n", - " f_00x[test_index] = res[:, 0]\n", - " f_01x[test_index] = res[:, 1]\n", - " res = m_prob.predict_proba(get_interactions(interaction, t1, x)[test_index, :])\n", - " f_10x[test_index] = res[:, 0]\n", - " f_11x[test_index] = res[:, 1]\n", - "\n", - " # predict f(M|T=t,X)\n", - " f_m0x[test_index] = m_prob.predict_proba(\n", - " get_interactions(interaction, t0, x)[test_index, :]\n", - " )[test_ind, m[test_index]]\n", - " f_m1x[test_index] = m_prob.predict_proba(\n", - " get_interactions(interaction, t1, x)[test_index, :]\n", - " )[test_ind, m[test_index]]\n", - "\n", - " # predict E[Y|T=t,M,X]\n", - " mu_t1[test_index] = y_reg.predict(\n", - " get_interactions(interaction, x, t1, m)[test_index, :]\n", - " )\n", - " mu_t0[test_index] = y_reg.predict(\n", - " get_interactions(interaction, x, t0, m)[test_index, :]\n", - " )\n", - "\n", - " # predict E[Y|T=t,M=m,X]\n", - " mu_t0_m0[test_index] = y_reg.predict(\n", - " get_interactions(interaction, x, t0, m0)[test_index, :]\n", - " )\n", - " mu_t0_m1[test_index] = y_reg.predict(\n", - " get_interactions(interaction, x, t0, m1)[test_index, :]\n", - " )\n", - " mu_t1_m1[test_index] = y_reg.predict(\n", - " get_interactions(interaction, x, t1, m1)[test_index, :]\n", - " )\n", - " mu_t1_m0[test_index] = y_reg.predict(\n", - " get_interactions(interaction, x, t1, m0)[test_index, :]\n", - " )\n", - "\n", - " # E[E[Y|T=1,M=m,X]|T=t,X] model fitting\n", - " reg_y_t1m1_t0 = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit(\n", - " x[test_index, :][ind_t0, :], mu_t1_m1[test_index][ind_t0]\n", - " )\n", - " reg_y_t1m0_t0 = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit(\n", - " x[test_index, :][ind_t0, :], mu_t1_m0[test_index][ind_t0]\n", - " )\n", - " reg_y_t1m1_t1 = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit(\n", - " x[test_index, :][~ind_t0, :], mu_t1_m1[test_index][~ind_t0]\n", - " )\n", - " reg_y_t1m0_t1 = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit(\n", - " x[test_index, :][~ind_t0, :], mu_t1_m0[test_index][~ind_t0]\n", - " )\n", - "\n", - " # predict E[E[Y|T=1,M=m,X]|T=t,X]\n", - " E_mu_t1_t0[test_index] = (\n", - " reg_y_t1m0_t0.predict(x[test_index, :]) * f_00x[test_index]\n", - " + reg_y_t1m1_t0.predict(x[test_index, :]) * f_01x[test_index]\n", - " )\n", - " E_mu_t1_t1[test_index] = (\n", - " reg_y_t1m0_t1.predict(x[test_index, :]) * f_10x[test_index]\n", - " + reg_y_t1m1_t1.predict(x[test_index, :]) * f_11x[test_index]\n", - " )\n", - "\n", - " # E[E[Y|T=0,M=m,X]|T=t,X] model fitting\n", - " reg_y_t0m1_t0 = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit(\n", - " x[test_index, :][ind_t0, :], mu_t0_m1[test_index][ind_t0]\n", - " )\n", - " reg_y_t0m0_t0 = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit(\n", - " x[test_index, :][ind_t0, :], mu_t0_m0[test_index][ind_t0]\n", - " )\n", - " reg_y_t0m1_t1 = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit(\n", - " x[test_index, :][~ind_t0, :], mu_t0_m1[test_index][~ind_t0]\n", - " )\n", - " reg_y_t0m0_t1 = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit(\n", - " x[test_index, :][~ind_t0, :], mu_t0_m0[test_index][~ind_t0]\n", - " )\n", - "\n", - " # predict E[E[Y|T=0,M=m,X]|T=t,X]\n", - " E_mu_t0_t0[test_index] = (\n", - " reg_y_t0m0_t0.predict(x[test_index, :]) * f_00x[test_index]\n", - " + reg_y_t0m1_t0.predict(x[test_index, :]) * f_01x[test_index]\n", - " )\n", - " E_mu_t0_t1[test_index] = (\n", - " reg_y_t0m0_t1.predict(x[test_index, :]) * f_10x[test_index]\n", - " + reg_y_t0m1_t1.predict(x[test_index, :]) * f_11x[test_index]\n", - " )\n", - "\n", - " # trimming\n", - " p_x_trim = p_x != np.clip(p_x, trim, 1 - trim)\n", - " f_m0x_trim = f_m0x != np.clip(f_m0x, trim, 1 - trim)\n", - " f_m1x_trim = f_m1x != np.clip(f_m1x, trim, 1 - trim)\n", - " trimmed = p_x_trim + f_m0x_trim + f_m1x_trim\n", - "\n", - " var_name = [\"t\", \"y\", \"p_x\", \"f_m0x\", \"f_m1x\", \"mu_t1\", \"mu_t0\"]\n", - " var_name += [\"E_mu_t1_t1\", \"E_mu_t0_t0\", \"E_mu_t1_t0\", \"E_mu_t0_t1\"]\n", - "\n", - " for var in var_name:\n", - " exec(f\"{var} = {var}[~trimmed]\")\n", - " n_discarded += np.sum(trimmed)\n", - "\n", - " # ytmt computing\n", - " y1m1 = t / p_x * (y - E_mu_t1_t1) + E_mu_t1_t1\n", - " y0m0 = (1 - t) / (1 - p_x) * (y - E_mu_t0_t0) + E_mu_t0_t0\n", - " y1m0 = (\n", - " (t / p_x) * (f_m0x / f_m1x) * (y - mu_t1)\n", - " + (1 - t) / (1 - p_x) * (mu_t1 - E_mu_t1_t0)\n", - " + E_mu_t1_t0\n", - " )\n", - " y0m1 = (\n", - " (1 - t) / (1 - p_x) * (f_m1x / f_m0x) * (y - mu_t0)\n", - " + t / p_x * (mu_t0 - E_mu_t0_t1)\n", - " + E_mu_t0_t1\n", - " )\n", - "\n", - " # effects computing\n", - " total = np.mean(y1m1 - y0m0)\n", - " direct1 = np.mean(y1m1 - y0m1)\n", - " direct0 = np.mean(y1m0 - y0m0)\n", - " indirect1 = np.mean(y1m1 - y1m0)\n", - " indirect0 = np.mean(y0m1 - y0m0)\n", - "\n", - " return total, direct1, direct0, indirect1, indirect0, n_discarded\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8baf76a4", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 128, - "id": "a7bb5072", - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "m is not binary", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0meffects_MR\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmultiply_robust_efficient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m\u001b[0m in \u001b[0;36mmultiply_robust_efficient\u001b[0;34m(y, t, m, x, interaction, forest, crossfit, trim, regularization, calibration, calib_method)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0mdim_m\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mdim_m\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"m is not binary\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: m is not binary" - ] - } - ], - "source": [ - "effects_MR = multiply_robust_efficient(y, t, m, x)" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "id": "2dabfd33", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 133, - "id": "f67c3f8f", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 134, - "id": "03bed34d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Linear coefficients\n", - "Direct effects\n", - "True theta1:[0.2], estimated theta1: 0.195\n", - "True theta0:[0.2], estimated theta0: 0.195\n", - "\\\n", - "Indirect effects\n", - "True delta1:[0.81], estimated delta1: 0.818\n", - "True delta0:[0.81], estimated delta0: 0.818\n", - "\\\n", - "Total effect\n", - "True tau:[1.01], estimated tau: 1.013\n" - ] - } - ], - "source": [ - "effects_linear = ols_mediation(y, t, m, x)\n", - "\n", - "tau_hat, theta_1_hat, theta_0_hat, delta_1_hat, delta_0_hat, n_non_trimmed = effects_linear\n", - "print(\"Linear coefficients\")\n", - "print(\"Direct effects\")\n", - "print(\"True theta1:{}, estimated theta1: {}\".format(theta_1, round(theta_1_hat,3)))\n", - "print(\"True theta0:{}, estimated theta0: {}\".format(theta_0, round(theta_0_hat,3)))\n", - "print(\"\\\\\")\n", - "print(\"Indirect effects\")\n", - "print(\"True delta1:{}, estimated delta1: {}\".format(delta_1, round(delta_1_hat,3)))\n", - "print(\"True delta0:{}, estimated delta0: {}\".format(delta_0, round(delta_0_hat,3)))\n", - "print(\"\\\\\")\n", - "print(\"Total effect\")\n", - "print(\"True tau:{}, estimated tau: {}\".format(tau, round(tau_hat,3)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c0c2b68", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/setup.py b/setup.py index 618e661..b96338e 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,6 @@ 'pandas>=1.2.1', 'scikit-learn>=0.22.1', 'numpy>=1.19.2', - 'rpy2>=2.9.4', 'scipy>=1.5.2', 'seaborn>=0.11.1', 'matplotlib>=3.3.2', diff --git a/src/med_bench/estimation/base.py b/src/med_bench/estimation/base.py new file mode 100644 index 0000000..b12ba6d --- /dev/null +++ b/src/med_bench/estimation/base.py @@ -0,0 +1,345 @@ +from abc import ABCMeta, abstractmethod +import numpy as np +from sklearn import clone + +from med_bench.utils.decorators import fitted + + +class Estimator: + """General abstract class for causal mediation Estimator""" + + __metaclass__ = ABCMeta + + def __init__(self, verbose: bool = True, crossfit: int = 0): + """Initializes Estimator base class + + Parameters + ---------- + verbose : bool + will print some logs if True + crossfit : int + number of crossfit folds, if 0 no crossfit is performed + """ + self._crossfit = crossfit + self._crossfit_check() + + self._verbose = verbose + + self._fitted = False + + @property + def verbose(self): + return self._verbose + + def _crossfit_check(self): + """Checks if the estimator inputs are valid""" + if self._crossfit > 0: + raise NotImplementedError( + """Crossfit is not implemented yet + You should perform something like this on your side : + cf_iterator = KFold(k=5) + for data_train, data_test in cf_iterator: + result.append(DML(...., cross_fitting=False) + .fit(train_data.X, train_data.t, train_data.m, train_data.y)\ + .estimate(test_data.X, test_data.t, test_data.m, test_data.y)) + np.mean(result)""" + ) + + @abstractmethod + def fit(self, t, m, x, y): + """Fits nuisance parameters to data + + Parameters + ---------- + t array-like, shape (n_samples) + treatment value for each unit, binary + + m array-like, shape (n_samples) + mediator value for each unit, here m is necessary binary and uni- + dimensional + + x array-like, shape (n_samples, n_features_covariates) + covariates (potential confounders) values + + y array-like, shape (n_samples) + outcome value for each unit, continuous + + """ + pass + + @abstractmethod + @fitted + def estimate(self, t, m, x, y): + """Estimates causal effect on data + + Parameters + ---------- + t array-like, shape (n_samples) + treatment value for each unit, binary + + m array-like, shape (n_samples) + mediator value for each unit, here m is necessary binary and uni- + dimensional + + x array-like, shape (n_samples, n_features_covariates) + covariates (potential confounders) values + + y array-like, shape (n_samples) + outcome value for each unit, continuous + + nuisances + """ + pass + + def _resize(self, t, m, x, y): + """Resize data for the right shape + + Parameters + ---------- + t array-like, shape (n_samples) + treatment value for each unit, binary + + m array-like, shape (n_samples) + mediator value for each unit, here m is necessary binary and uni- + dimensional + + x array-like, shape (n_samples, n_features_covariates) + covariates (potential confounders) values + + y array-like, shape (n_samples) + outcome value for each unit, continuous + """ + if len(y) != len(y.ravel()): + raise ValueError("Multidimensional y is not supported") + if len(t) != len(t.ravel()): + raise ValueError("Multidimensional t is not supported") + + n = len(y) + if len(x.shape) == 1: + x.reshape(n, 1) + if len(m.shape) == 1: + m = m.reshape(n, 1) + + if n != len(x) or n != len(m) or n != len(t): + raise ValueError("Inputs don't have the same number of observations") + + y = y.ravel() + t = t.ravel() + + return t, m, x, y + + def _fit_treatment_propensity_x(self, t, x): + """Fits the nuisance parameter for the propensity P(T=1|X)""" + self._classifier_t_x = clone(self.classifier).fit(x, t) + + return self + + def _fit_treatment_propensity_xm(self, t, m, x): + """Fits the nuisance parameter for the propensity P(T=1|X, M)""" + xm = np.hstack((x, m)) + self._classifier_t_xm = clone(self.classifier).fit(xm, t) + + return self + + def _fit_binary_mediator_probability(self, t, m, x): + """Fits the nuisance parameter for the density f(M=m|T, X)""" + # estimate mediator densities + t_x = np.hstack([t.reshape(-1, 1), x]) + + # Fit classifier + self._classifier_m = clone(self.classifier).fit(t_x, m.ravel()) + + return self + + def _fit_conditional_mean_outcome(self, t, m, x, y): + """Fits the nuisance for the conditional mean outcome for the density f(M=m|T, X)""" + x_t_m = np.hstack([x, t.reshape(-1, 1), m]) + self._regressor_y = clone(self.regressor).fit(x_t_m, y) + + return self + + def _fit_cross_conditional_mean_outcome(self, t, m, x, y): + """Fits the cross conditional mean outcome E[E[Y|T=t,M,X]|T=t',X]""" + + xm = np.hstack((x, m)) + + n = t.shape[0] + train = np.arange(n) + ( + mu_1mx_nested, # E[Y|T=1,M,X] predicted on train_nested set + mu_0mx_nested, # E[Y|T=0,M,X] predicted on train_nested set + ) = [np.zeros(n) for _ in range(2)] + + train1 = train[t[train] == 1] + train0 = train[t[train] == 0] + + train_mean, train_nested = np.array_split(train, 2) + train_mean1 = train_mean[t[train_mean] == 1] + train_mean0 = train_mean[t[train_mean] == 0] + train_nested1 = train_nested[t[train_nested] == 1] + train_nested0 = train_nested[t[train_nested] == 0] + + self.regressors = {} + + # predict E[Y|T=1,M,X] + self.regressors["y_t1_mx"] = clone(self.regressor) + self.regressors["y_t1_mx"].fit(xm[train_mean1], y[train_mean1]) + mu_1mx_nested[train_nested] = self.regressors["y_t1_mx"].predict( + xm[train_nested] + ) + + # predict E[Y|T=0,M,X] + self.regressors["y_t0_mx"] = clone(self.regressor) + self.regressors["y_t0_mx"].fit(xm[train_mean0], y[train_mean0]) + mu_0mx_nested[train_nested] = self.regressors["y_t0_mx"].predict( + xm[train_nested] + ) + + # predict E[E[Y|T=1,M,X]|T=0,X] + self.regressors["y_t1_x_t0"] = clone(self.regressor) + self.regressors["y_t1_x_t0"].fit(x[train_nested0], mu_1mx_nested[train_nested0]) + + # predict E[E[Y|T=0,M,X]|T=1,X] + self.regressors["y_t0_x_t1"] = clone(self.regressor) + self.regressors["y_t0_x_t1"].fit(x[train_nested1], mu_0mx_nested[train_nested1]) + + # predict E[Y|T=1,X] + self.regressors["y_t1_x"] = clone(self.regressor) + self.regressors["y_t1_x"].fit(x[train1], y[train1]) + + # predict E[Y|T=0,X] + self.regressors["y_t0_x"] = clone(self.regressor) + self.regressors["y_t0_x"].fit(x[train0], y[train0]) + + return self + + def _estimate_binary_mediator_probability(self, x, m): + """ + Estimate mediator density P(M=m|T,X) for a binary M + + Returns + ------- + f_m0x, array-like, shape (n_samples) + probabilities f(M|T=0,X) + f_m1x, array-like, shape (n_samples) + probabilities f(M|T=1,X) + """ + n = x.shape[0] + + t0 = np.zeros((n, 1)) + t1 = np.ones((n, 1)) + + m = m.ravel() + + t0_x = np.hstack([t0.reshape(-1, 1), x]) + t1_x = np.hstack([t1.reshape(-1, 1), x]) + + f_m0x = self._classifier_m.predict_proba(t0_x)[np.arange(m.shape[0]), m] + f_m1x = self._classifier_m.predict_proba(t1_x)[np.arange(m.shape[0]), m] + + return f_m0x, f_m1x + + def _estimate_binary_mediator_probability_table(self, x): + """ + Estimate mediator density f(M|T,X) + + Returns + ------- + f_00x: array-like, shape (n_samples) + probabilities f(M=0|T=0,X) + f_01x, array-like, shape (n_samples) + probabilities f(M=0|T=1,X) + f_10x, array-like, shape (n_samples) + probabilities f(M=1|T=0,X) + f_11x, array-like, shape (n_samples) + probabilities f(M=1|T=1,X) + """ + n = x.shape[0] + + t0 = np.zeros((n, 1)) + t1 = np.ones((n, 1)) + + t0_x = np.hstack([t0.reshape(-1, 1), x]) + t1_x = np.hstack([t1.reshape(-1, 1), x]) + + # predict f(M=m|T=t,X) + fm_0 = self._classifier_m.predict_proba(t0_x) + f_00x = fm_0[:, 0] + f_01x = fm_0[:, 1] + fm_1 = self._classifier_m.predict_proba(t1_x) + f_10x = fm_1[:, 0] + f_11x = fm_1[:, 1] + + return f_00x, f_01x, f_10x, f_11x + + def _estimate_treatment_propensity_x(self, x): + """ + Estimate treatment propensity P(T=1|X) + + Returns + ------- + p_x : array-like, shape (n_samples) + probabilities P(T=1|X) + """ + p_x = self._classifier_t_x.predict_proba(x)[:, 1] + + return p_x + + def _estimate_treatment_propensity_xm(self, m, x): + """ + Estimate treatment probabilities P(T=1|X) and P(T=1|X, M) with train + + Returns + ------- + p_x : array-like, shape (n_samples) + probabilities P(T=1|X) + p_xm : array-like, shape (n_samples) + probabilities P(T=1|X, M) + """ + xm = np.hstack((x, m)) + + p_xm = self._classifier_t_xm.predict_proba(xm)[:, 1] + + return p_xm + + def _estimate_cross_conditional_mean_outcome(self, m, x): + """ + Estimate the conditional mean outcome, + the cross conditional mean outcome + + Returns + ------- + mu_m0x, array-like, shape (n_samples) + conditional mean outcome estimates E[Y|T=0,M,X] + mu_m1x, array-like, shape (n_samples) + conditional mean outcome estimates E[Y|T=1,M,X] + mu_0x, array-like, shape (n_samples) + cross conditional mean outcome estimates E[E[Y|T=0,M,X]|T=0,X] + E_mu_t0_t1, array-like, shape (n_samples) + cross conditional mean outcome estimates E[E[Y|T=0,M,X]|T=1,X] + E_mu_t1_t0, array-like, shape (n_samples) + cross conditional mean outcome estimates E[E[Y|T=1,M,X]|T=0,X] + mu_1x, array-like, shape (n_samples) + cross conditional mean outcome estimates E[E[Y|T=1,M,X]|T=1,X] + """ + xm = np.hstack((x, m)) + + # predict E[Y|T=1,M,X] + mu_1mx = self.regressors["y_t1_mx"].predict(xm) + + # predict E[Y|T=0,M,X] + mu_0mx = self.regressors["y_t0_mx"].predict(xm) + + # predict E[E[Y|T=1,M,X]|T=0,X] + E_mu_t1_t0 = self.regressors["y_t1_x_t0"].predict(x) + + # predict E[E[Y|T=0,M,X]|T=1,X] + E_mu_t0_t1 = self.regressors["y_t0_x_t1"].predict(x) + + # predict E[Y|T=1,X] + mu_1x = self.regressors["y_t1_x"].predict(x) + + # predict E[Y|T=0,X] + mu_0x = self.regressors["y_t0_x"].predict(x) + + return mu_0mx, mu_1mx, mu_0x, E_mu_t0_t1, E_mu_t1_t0, mu_1x diff --git a/src/med_bench/estimation/mediation_coefficient_product.py b/src/med_bench/estimation/mediation_coefficient_product.py new file mode 100644 index 0000000..cca56cd --- /dev/null +++ b/src/med_bench/estimation/mediation_coefficient_product.py @@ -0,0 +1,82 @@ +import numpy as np +from sklearn.linear_model import RidgeCV + +from med_bench.estimation.base import Estimator +from med_bench.utils.constants import ALPHAS, CV_FOLDS, TINY +from med_bench.utils.decorators import fitted + + +class CoefficientProduct(Estimator): + """Coefficient Product estimatation method class""" + + def __init__(self, regularize: bool, **kwargs): + """Initializes Coefficient product estimatation method + + Parameters + ---------- + regularize (bool) : regularization parameter + """ + super().__init__(**kwargs) + + self._regularize = regularize + + def fit(self, t, m, x, y): + """Fits nuisance parameters to data + + Parameters + ---------- + t array-like, shape (n_samples) + treatment value for each unit, binary + + m array-like, shape (n_samples) + mediator value for each unit, here m is necessary binary and uni- + dimensional + + x array-like, shape (n_samples, n_features_covariates) + covariates (potential confounders) values + + y array-like, shape (n_samples) + outcome value for each unit, continuous + + """ + if self._regularize: + alphas = ALPHAS + else: + alphas = [TINY] + + t, m, x, y = self._resize(t, m, x, y) + self._coef_t_m = np.zeros(m.shape[1]) + for i in range(m.shape[1]): + m_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit( + np.hstack((x, t.reshape(-1, 1))), m[:, i] + ) + self._coef_t_m[i] = m_reg.coef_[-1] + y_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit( + np.hstack((x, t.reshape(-1, 1), m)), y + ) + + self._coef_y = y_reg.coef_ + + self._fitted = True + + if self.verbose: + print("Nuisance models fitted") + + @fitted + def estimate(self, t, m, x, y): + """Estimates causal effect on data""" + direct_effect_treated = self._coef_y[x.shape[1]] + direct_effect_control = direct_effect_treated + indirect_effect_treated = sum( + self._coef_y[x.shape[1] + 1 :] * self._coef_t_m + ) + indirect_effect_control = indirect_effect_treated + + causal_effects = { + "total_effect": direct_effect_treated + indirect_effect_control, + "direct_effect_treated": direct_effect_treated, + "direct_effect_control": direct_effect_control, + "indirect_effect_treated": indirect_effect_treated, + "indirect_effect_control": indirect_effect_control, + } + return causal_effects diff --git a/src/med_bench/estimation/mediation_dml.py b/src/med_bench/estimation/mediation_dml.py new file mode 100644 index 0000000..0e573f5 --- /dev/null +++ b/src/med_bench/estimation/mediation_dml.py @@ -0,0 +1,122 @@ +import numpy as np + +from med_bench.estimation.base import Estimator + + +class DoubleMachineLearning(Estimator): + """Double Machine Learning estimation method class""" + + def __init__(self, regressor, classifier, normalized: bool, **kwargs): + """Initializes Double Machine Learning estimation method + + Parameters + ---------- + regressor + Regressor used for mu estimation, can be any object with a fit and predict method + classifier + Classifier used for propensity estimation, can be any object with a fit and predict_proba method + normalized : bool + Whether to normalize the propensity scores + """ + super().__init__(**kwargs) + + assert hasattr( + regressor, "fit" + ), "The model does not have a 'fit' method." + assert hasattr( + regressor, "predict" + ), "The model does not have a 'predict' method." + assert hasattr( + classifier, "fit" + ), "The model does not have a 'fit' method." + assert hasattr( + classifier, "predict_proba" + ), "The model does not have a 'predict_proba' method." + self.regressor = regressor + self.classifier = classifier + + self._normalized = normalized + + def fit(self, t, m, x, y): + """Fits nuisance parameters to data""" + t, m, x, y = self._resize(t, m, x, y) + + self._fit_treatment_propensity_x(t, x) + self._fit_treatment_propensity_xm(t, m, x) + self._fit_cross_conditional_mean_outcome(t, m, x, y) + self._fitted = True + + if self.verbose: + print("Nuisance models fitted") + return self + + def estimate(self, t, m, x, y): + """Estimates causal effect on data""" + t, m, x, y = self._resize(t, m, x, y) + + p_x = self._estimate_treatment_propensity_x(x) + p_xm = self._estimate_treatment_propensity_xm(m, x) + + mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 = ( + self._estimate_cross_conditional_mean_outcome(m, x) + ) + + # score computing + if self._normalized: + sum_score_m1 = np.mean(t / p_x) + sum_score_m0 = np.mean((1 - t) / (1 - p_x)) + sum_score_t1m0 = np.mean(t * (1 - p_xm) / (p_xm * (1 - p_x))) + sum_score_t0m1 = np.mean((1 - t) * p_xm / ((1 - p_xm) * p_x)) + y1m1 = (t / p_x * (y - E_mu_t1_t1)) / sum_score_m1 + E_mu_t1_t1 + y0m0 = ( + (1 - t) / (1 - p_x) * (y - E_mu_t0_t0) + ) / sum_score_m0 + E_mu_t0_t0 + y1m0 = ( + (t * (1 - p_xm) / (p_xm * (1 - p_x)) * (y - mu_1mx)) + / sum_score_t1m0 + + ((1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0)) / sum_score_m0 + + E_mu_t1_t0 + ) + y0m1 = ( + ((1 - t) * p_xm / ((1 - p_xm) * p_x) * (y - mu_0mx)) + / sum_score_t0m1 + + (t / p_x * (mu_0mx - E_mu_t0_t1)) / sum_score_m1 + + E_mu_t0_t1 + ) + else: + y1m1 = t / p_x * (y - E_mu_t1_t1) + E_mu_t1_t1 + y0m0 = (1 - t) / (1 - p_x) * (y - E_mu_t0_t0) + E_mu_t0_t0 + y1m0 = ( + t * (1 - p_xm) / (p_xm * (1 - p_x)) * (y - mu_1mx) + + (1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0) + + E_mu_t1_t0 + ) + y0m1 = ( + (1 - t) * p_xm / ((1 - p_xm) * p_x) * (y - mu_0mx) + + t / p_x * (mu_0mx - E_mu_t0_t1) + + E_mu_t0_t1 + ) + + # mean score computing + eta_t1t1 = np.mean(y1m1) + eta_t0t0 = np.mean(y0m0) + eta_t1t0 = np.mean(y1m0) + eta_t0t1 = np.mean(y0m1) + + # effects computing + total_effect = eta_t1t1 - eta_t0t0 + + direct_effect_treated = eta_t1t1 - eta_t0t1 + direct_effect_control = eta_t1t0 - eta_t0t0 + indirect_effect_treated = eta_t1t1 - eta_t1t0 + indirect_effect_control = eta_t0t1 - eta_t0t0 + + causal_effects = { + "total_effect": total_effect, + "direct_effect_treated": direct_effect_treated, + "direct_effect_control": direct_effect_control, + "indirect_effect_treated": indirect_effect_treated, + "indirect_effect_control": indirect_effect_control, + } + + return causal_effects diff --git a/src/med_bench/estimation/mediation_g_computation.py b/src/med_bench/estimation/mediation_g_computation.py new file mode 100644 index 0000000..5c5085a --- /dev/null +++ b/src/med_bench/estimation/mediation_g_computation.py @@ -0,0 +1,147 @@ +import numpy as np + +from med_bench.estimation.base import Estimator +from med_bench.utils.decorators import fitted +from med_bench.utils.utils import is_array_binary + + +class GComputation(Estimator): + """GComputation estimation method class""" + + def __init__(self, regressor, classifier, **kwargs): + """Initializes GComputation estimation method + + Parameters + ---------- + regressor + Regressor used for mu estimation, can be any object with a fit and predict method + classifier + Classifier used for propensity estimation, can be any object with a fit and predict_proba method + """ + super().__init__(**kwargs) + + assert hasattr( + regressor, "fit" + ), "The model does not have a 'fit' method." + assert hasattr( + regressor, "predict" + ), "The model does not have a 'predict' method." + assert hasattr( + classifier, "fit" + ), "The model does not have a 'fit' method." + assert hasattr( + classifier, "predict_proba" + ), "The model does not have a 'predict_proba' method." + self.regressor = regressor + self.classifier = classifier + + def _estimate_conditional_mean_outcome_table(self, x): + """ + Estimate conditional mean outcome E[Y|T,M,X] + + Returns + ------- + mu_00x: array-like, shape (n_samples) + conditional mean outcome estimates E[Y|T=0,M=0,X] + mu_01x, array-like, shape (n_samples) + conditional mean outcome estimates E[Y|T=0,M=1,X] + mu_10x, array-like, shape (n_samples) + conditional mean outcome estimates E[Y|T=1,M=0,X] + mu_11x, array-like, shape (n_samples) + conditional mean outcome estimates E[Y|T=1,M=1,X] + """ + n = x.shape[0] + + t0 = np.zeros((n, 1)) + t1 = np.ones((n, 1)) + m0 = np.zeros((n, 1)) + m1 = np.ones((n, 1)) + + x_t1_m1 = np.hstack([x, t1.reshape(-1, 1), m1]) + x_t1_m0 = np.hstack([x, t1.reshape(-1, 1), m0]) + x_t0_m1 = np.hstack([x, t0.reshape(-1, 1), m1]) + x_t0_m0 = np.hstack([x, t0.reshape(-1, 1), m0]) + + mu_00x = self._regressor_y.predict(x_t0_m0) + mu_01x = self._regressor_y.predict(x_t0_m1) + mu_10x = self._regressor_y.predict(x_t1_m0) + mu_11x = self._regressor_y.predict(x_t1_m1) + + return mu_00x, mu_01x, mu_10x, mu_11x + + def fit(self, t, m, x, y): + """Fits nuisance parameters to data""" + t, m, x, y = self._resize(t, m, x, y) + + if is_array_binary(m): + self._fit_binary_mediator_probability(t, m, x) + self._fit_conditional_mean_outcome(t, m, x, y) + else: + self._fit_cross_conditional_mean_outcome(t, m, x, y) + + self._fitted = True + + if self.verbose: + print("Nuisance models fitted") + + return self + + @fitted + def estimate(self, t, m, x, y): + """Estimates causal effect on data""" + t, m, x, y = self._resize(t, m, x, y) + + if is_array_binary(m): + f_00x, f_01x, f_10x, f_11x = ( + self._estimate_binary_mediator_probability_table(x) + ) + mu_00x, mu_01x, mu_10x, mu_11x = ( + self._estimate_conditional_mean_outcome_table(x) + ) + + direct_effect_i1 = mu_11x - mu_01x + direct_effect_i0 = mu_10x - mu_00x + n = len(y) + direct_effect_treated = ( + direct_effect_i1 * f_11x + direct_effect_i0 * f_10x + ).sum() / n + direct_effect_control = ( + direct_effect_i1 * f_01x + direct_effect_i0 * f_00x + ).sum() / n + indirect_effect_i1 = f_11x - f_01x + indirect_effect_i0 = f_10x - f_00x + indirect_effect_treated = ( + indirect_effect_i1 * mu_11x + indirect_effect_i0 * mu_10x + ).sum() / n + indirect_effect_control = ( + indirect_effect_i1 * mu_01x + indirect_effect_i0 * mu_00x + ).sum() / n + total_effect = direct_effect_control + indirect_effect_treated + else: + + (mu_0mx, mu_1mx, y0m0, y0m1, y1m0, y1m1) = ( + self._estimate_cross_conditional_mean_outcome(m, x) + ) + # mean score computing + eta_t1t1 = np.mean(y1m1) + eta_t0t0 = np.mean(y0m0) + eta_t1t0 = np.mean(y1m0) + eta_t0t1 = np.mean(y0m1) + + # effects computing + total_effect = eta_t1t1 - eta_t0t0 + + direct_effect_treated = eta_t1t1 - eta_t0t1 + direct_effect_control = eta_t1t0 - eta_t0t0 + indirect_effect_treated = eta_t1t1 - eta_t1t0 + indirect_effect_control = eta_t0t1 - eta_t0t0 + + causal_effects = { + "total_effect": total_effect, + "direct_effect_treated": direct_effect_treated, + "direct_effect_control": direct_effect_control, + "indirect_effect_treated": indirect_effect_treated, + "indirect_effect_control": indirect_effect_control, + } + + return causal_effects diff --git a/src/med_bench/estimation/mediation_ipw.py b/src/med_bench/estimation/mediation_ipw.py new file mode 100644 index 0000000..c9ac248 --- /dev/null +++ b/src/med_bench/estimation/mediation_ipw.py @@ -0,0 +1,90 @@ +import numpy as np + +from med_bench.estimation.base import Estimator +from med_bench.utils.decorators import fitted + + +class InversePropensityWeighting(Estimator): + """Inverse propensity weighting estimation method class""" + + def __init__(self, classifier, clip: float, trim: float, **kwargs): + """Initializes Inverse propensity weighting estimation method + + Parameters + ---------- + classifier + Classifier used for propensity estimation, can be any object with a fit and predict_proba method + clips : float + Clipping value for propensity scores + trim : float + Trimming value for propensity scores + """ + super().__init__(**kwargs) + + assert hasattr( + classifier, "fit" + ), "The model does not have a 'fit' method." + assert hasattr( + classifier, "predict_proba" + ), "The model does not have a 'predict_proba' method." + self.classifier = classifier + + self._clip = clip + self._trim = trim + + def fit(self, t, m, x, y): + """Fits nuisance parameters to data""" + + t, m, x, y = self._resize(t, m, x, y) + + self._fit_treatment_propensity_x(t, x) + self._fit_treatment_propensity_xm(t, m, x) + + self._fitted = True + + if self.verbose: + print("Nuisance models fitted") + + return self + + @fitted + def estimate(self, t, m, x, y): + """Estimates causal effect on data""" + + t, m, x, y = self._resize(t, m, x, y) + p_x = self._estimate_treatment_propensity_x(x) + p_xm = self._estimate_treatment_propensity_xm(m, x) + + ind = (p_xm > self._trim) & (p_xm < (1 - self._trim)) + y, t, p_x, p_xm = y[ind], t[ind], p_x[ind], p_xm[ind] + + # note on the names, ytmt' = Y(t, M(t')), the treatment needs to be + # binary but not the mediator + p_x = np.clip(p_x, self._clip, 1 - self._clip) + p_xm = np.clip(p_xm, self._clip, 1 - self._clip) + + # importance weighting + y1m1 = np.sum(y * t / p_x) / np.sum(t / p_x) + y1m0 = np.sum(y * t * (1 - p_xm) / (p_xm * (1 - p_x))) / np.sum( + t * (1 - p_xm) / (p_xm * (1 - p_x)) + ) + y0m0 = np.sum(y * (1 - t) / (1 - p_x)) / np.sum((1 - t) / (1 - p_x)) + y0m1 = np.sum(y * (1 - t) * p_xm / ((1 - p_xm) * p_x)) / np.sum( + (1 - t) * p_xm / ((1 - p_xm) * p_x) + ) + + total_effect = y1m1 - y0m0 + direct_effect_treated = y1m1 - y0m1 + direct_effect_control = y1m0 - y0m0 + indirect_effect_treated = y1m1 - y1m0 + indirect_effect_control = y0m1 - y0m0 + + causal_effects = { + "total_effect": total_effect, + "direct_effect_treated": direct_effect_treated, + "direct_effect_control": direct_effect_control, + "indirect_effect_treated": indirect_effect_treated, + "indirect_effect_control": indirect_effect_control, + } + + return causal_effects diff --git a/src/med_bench/estimation/mediation_mr.py b/src/med_bench/estimation/mediation_mr.py new file mode 100644 index 0000000..fd0d07f --- /dev/null +++ b/src/med_bench/estimation/mediation_mr.py @@ -0,0 +1,147 @@ +import numpy as np + +from med_bench.estimation.base import Estimator +from med_bench.utils.decorators import fitted +from med_bench.utils.utils import is_array_integer + + +class MultiplyRobust(Estimator): + """Iniitializes Multiply Robust estimatation method class""" + + def __init__(self, regressor, classifier, ratio: str, normalized, **kwargs): + """Initializes MulitplyRobust estimatation method + + Parameters + ---------- + regressor + Regressor used for mu estimation, can be any object with a fit and predict method + classifier + Classifier used for propensity estimation, can be any object with a fit and predict_proba method + ratio : str + Ratio to use for estimation, can be either 'density' or 'propensities' + normalized : bool + Whether to normalize the propensity scores + """ + super().__init__(**kwargs) + + assert hasattr( + regressor, "fit" + ), "The model does not have a 'fit' method." + assert hasattr( + regressor, "predict" + ), "The model does not have a 'predict' method." + assert hasattr( + classifier, "fit" + ), "The model does not have a 'fit' method." + assert hasattr( + classifier, "predict_proba" + ), "The model does not have a 'predict_proba' method." + self.regressor = regressor + self.classifier = classifier + + assert ratio in ["density", "propensities"] + self._ratio = ratio + self._normalized = normalized + + def fit(self, t, m, x, y): + """Fits nuisance parameters to data""" + t, m, x, y = self._resize(t, m, x, y) + + if self._ratio == "density" and is_array_integer(m): + self._fit_treatment_propensity_x(t, x) + self._fit_binary_mediator_probability(t, m, x) + + elif self._ratio == "propensities": + self._fit_treatment_propensity_x(t, x) + self._fit_treatment_propensity_xm(t, m, x) + + elif self._ratio == "density" and not is_array_integer(m): + raise NotImplementedError( + """Continuous mediator cannot use the density ratio method, + use a discrete mediator or set the ratio to 'propensities'""" + ) + + self._fit_cross_conditional_mean_outcome(t, m, x, y) + + self._fitted = True + + if self.verbose: + print("Nuisance models fitted") + + return self + + @fitted + def estimate(self, t, m, x, y): + """Estimates causal effect on data""" + # Format checking + t, m, x, y = self._resize(t, m, x, y) + + if self._ratio == "density": + f_m0x, f_m1x = self._estimate_binary_mediator_probability(x, m) + p_x = self._estimate_treatment_propensity_x(x) + ratio_t1_m0 = f_m0x / (p_x * f_m1x) + ratio_t0_m1 = f_m1x / ((1 - p_x) * f_m0x) + + elif self._ratio == "propensities": + p_x = self._estimate_treatment_propensity_x(x) + p_xm = self._estimate_treatment_propensity_xm(m, x) + ratio_t1_m0 = (1 - p_xm) / ((1 - p_x) * p_xm) + ratio_t0_m1 = p_xm / ((1 - p_xm) * p_x) + + mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 = ( + self._estimate_cross_conditional_mean_outcome(m, x) + ) + + # score computing + if self._normalized: + sum_score_m1 = np.mean(t / p_x) + sum_score_m0 = np.mean((1 - t) / (1 - p_x)) + sum_score_t1m0 = np.mean(t * ratio_t1_m0) + sum_score_t0m1 = np.mean((1 - t) * ratio_t0_m1) + + y1m1 = (t / p_x * (y - E_mu_t1_t1)) / sum_score_m1 + E_mu_t1_t1 + y0m0 = ( + (1 - t) / (1 - p_x) * (y - E_mu_t0_t0) + ) / sum_score_m0 + E_mu_t0_t0 + + y1m0 = ( + (t * ratio_t1_m0 * (y - mu_1mx)) / sum_score_t1m0 + + ((1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0)) / sum_score_m0 + + E_mu_t1_t0 + ) + + y0m1 = ( + ((1 - t) * ratio_t0_m1 * (y - mu_0mx)) / sum_score_t0m1 + + t / p_x * (mu_0mx - E_mu_t0_t1) / sum_score_m1 + + E_mu_t0_t1 + ) + else: + y1m1 = t / p_x * (y - E_mu_t1_t1) + E_mu_t1_t1 + y0m0 = (1 - t) / (1 - p_x) * (y - E_mu_t0_t0) + E_mu_t0_t0 + + y1m0 = ( + t * ratio_t1_m0 * (y - mu_1mx) + + (1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0) + + E_mu_t1_t0 + ) + y0m1 = ( + (1 - t) * ratio_t0_m1 * (y - mu_0mx) + + t / p_x * (mu_0mx - E_mu_t0_t1) + + E_mu_t0_t1 + ) + + # effects computing + total = np.mean(y1m1 - y0m0) + direct1 = np.mean(y1m1 - y0m1) + direct0 = np.mean(y1m0 - y0m0) + indirect1 = np.mean(y1m1 - y1m0) + indirect0 = np.mean(y0m1 - y0m0) + + causal_effects = { + "total_effect": total, + "direct_effect_treated": direct1, + "direct_effect_control": direct0, + "indirect_effect_treated": indirect1, + "indirect_effect_control": indirect0, + } + return causal_effects diff --git a/src/med_bench/estimation/mediation_tmle.py b/src/med_bench/estimation/mediation_tmle.py new file mode 100644 index 0000000..71fc3e8 --- /dev/null +++ b/src/med_bench/estimation/mediation_tmle.py @@ -0,0 +1,241 @@ +import numpy as np +from sklearn.base import clone +from sklearn.linear_model import LinearRegression + +from med_bench.estimation.base import Estimator +from med_bench.utils.decorators import fitted +from med_bench.utils.utils import is_array_binary + +ALPHA = 10 + + +class TMLE(Estimator): + """Implementation of targeted maximum likelihood estimation method class""" + + def __init__(self, regressor, classifier, ratio, **kwargs): + """_summary_ + + Parameters + ---------- + regressor + Regressor used for mu estimation, can be any object with a fit and predict method + classifier + Classifier used for propensity estimation, can be any object with a fit and predict_proba method + ratio : str + Ratio to use for estimation, can be either 'density' or 'propensities' + """ + super().__init__(**kwargs) + + assert hasattr(regressor, "fit"), "The model does not have a 'fit' method." + assert hasattr( + regressor, "predict" + ), "The model does not have a 'predict' method." + assert hasattr(classifier, "fit"), "The model does not have a 'fit' method." + assert hasattr( + classifier, "predict_proba" + ), "The model does not have a 'predict_proba' method." + self.regressor = regressor + self.classifier = classifier + + assert ratio in ["density", "propensities"] + self._ratio = ratio + + def _one_step_correction_direct(self, t, m, x, y): + """Implements the one step correction for the estimation of the natural + direct effect with the ratio of mediator densities or treatment + propensities. + + """ + n = t.shape[0] + t, m, x, y = self._resize(t, m, x, y) + t0 = np.zeros((n)) + t1 = np.ones((n)) + + # estimate mediator densities + if self._ratio == "density": + f_m0x, f_m1x = self._estimate_binary_mediator_probability(x, m) + p_x = self._estimate_treatment_propensity_x(x) + ratio = f_m0x / (p_x * f_m1x) + + elif self._ratio == "propensities": + p_x = self._estimate_treatment_propensity_x(x) + p_xm = self._estimate_treatment_propensity_xm(m, x) + ratio = (1 - p_xm) / ((1 - p_x) * p_xm) + + # estimation of corrective features for the conditional mean outcome + h_corrector = t * ratio - (1 - t) / (1 - p_x) + + x_t_mr = np.hstack( + [var.reshape(-1, 1) if len(var.shape) == 1 else var for var in [x, t, m]] + ) + mu_tmx = self._regressor_y.predict(x_t_mr) + + # regress with OLS the error of conditional mean outcome regressor on + # corrective features + reg = LinearRegression(fit_intercept=False).fit( + h_corrector.reshape(-1, 1), (y - mu_tmx).squeeze() + ) + # corrective coefficient epsilon + epsilon_h = reg.coef_ + + x_t0_m = np.hstack( + [var.reshape(-1, 1) if len(var.shape) == 1 else var for var in [x, t0, m]] + ) + x_t1_m = np.hstack( + [var.reshape(-1, 1) if len(var.shape) == 1 else var for var in [x, t1, m]] + ) + + # one step corrected conditional mean outcomes + mu_t0_mx = self._regressor_y.predict(x_t0_m) + h_corrector_t0 = t0 * ratio - (1 - t0) / (1 - p_x) + mu_t1_mx = self._regressor_y.predict(x_t1_m) + h_corrector_t1 = t1 * ratio - (1 - t1) / (1 - p_x) + mu_t0_mx_star = mu_t0_mx + epsilon_h * h_corrector_t0 + mu_t1_mx_star = mu_t1_mx + epsilon_h * h_corrector_t1 + + # estimation of natural direct effect + reg_cross = clone(self.regressor) + reg_cross.fit( + x[t == 0], (mu_t1_mx_star[t == 0] - mu_t0_mx_star[t == 0]).squeeze() + ) + + theta_0 = reg_cross.predict(x) + + # one step correction of the natural direct effect + c_corrector = (1 - t) / (1 - p_x) + reg = LinearRegression(fit_intercept=False).fit( + c_corrector.reshape(-1, 1)[t == 0], + (mu_t1_mx_star[t == 0] - y[t == 0] - theta_0[t == 0]).squeeze(), + ) + epsilon_c = reg.coef_ + + theta_0_star = theta_0 + epsilon_c * c_corrector + theta_0_star = np.mean(theta_0_star) + + return theta_0_star + + def _one_step_correction_indirect(self, t, m, x, y): + """Implements the one step correction for the estimation of the natural + indirect effect with the ratio of mediator densities or treatment + propensities. + + """ + n = t.shape[0] + t, m, x, y = self._resize(t, m, x, y) + t0 = np.zeros((n)) + t1 = np.ones((n)) + + # estimate mediator densities + if self._ratio == "density": + f_m0x, f_m1x = self._estimate_binary_mediator_probability(x, m) + p_x = self._estimate_treatment_propensity_x(x) + ratio = f_m0x / (p_x * f_m1x) + + elif self._ratio == "propensities": + p_x = self._estimate_treatment_propensity_x(x) + p_xm = self._estimate_treatment_propensity_xm(m, x) + ratio = (1 - p_xm) / ((1 - p_x) * p_xm) + + # estimation of corrective features for the conditional mean outcome + h_corrector = t / p_x - t * ratio + + x_t_mr = np.hstack( + [var.reshape(-1, 1) if len(var.shape) == 1 else var for var in [x, t, m]] + ) + mu_tmx = self._regressor_y.predict(x_t_mr) + + # regress with OLS the error of conditional mean outcome regressor on + # corrective features + reg = LinearRegression(fit_intercept=False).fit( + h_corrector.reshape(-1, 1), (y - mu_tmx).squeeze() + ) + + # corrective coefficient epsilon + epsilon_h = reg.coef_ + + x_t1_m = np.hstack( + [var.reshape(-1, 1) if len(var.shape) == 1 else var for var in [x, t1, m]] + ) + + # one step corrected conditional mean outcomes + mu_t1_mx = self._regressor_y.predict(x_t1_m) + h_corrector_t1 = t1 / p_x - t1 * ratio + mu_t1_mx_star = mu_t1_mx + epsilon_h * h_corrector_t1 + + # cross conditional mean outcome control + reg_cross = clone(self.regressor) + reg_cross.fit(x[t == 0], mu_t1_mx_star[t == 0]) + omega_t0x = reg_cross.predict(x) + + # one step corrected cross conditional mean outcome for control + c_corrector_t0 = (2 * t0 - 1) / p_x[:, None] + reg = LinearRegression(fit_intercept=False).fit( + c_corrector_t0[t == 0], + (mu_t1_mx_star[t == 0] - omega_t0x[t == 0]).squeeze(), + ) + epsilon_c_t0 = reg.coef_ + omega_t0x_star = omega_t0x + epsilon_c_t0 * c_corrector_t0 + + # cross conditional mean outcome treated + reg_cross = clone(self.regressor) + reg_cross.fit(x[t == 1], y[t == 1]) + omega_t1x = reg_cross.predict(x) + + # one step corrected cross conditional mean outcome for treated + c_corrector_t1 = (2 * t1 - 1) / p_x[:, None] + reg = LinearRegression(fit_intercept=False).fit( + c_corrector_t1[t == 1], (y[t == 1] - omega_t1x[t == 1]).squeeze() + ) + epsilon_c_t1 = reg.coef_ + omega_t1x_star = omega_t1x + epsilon_c_t1 * c_corrector_t1 + + # natural indirect effect + delta_1 = np.mean(omega_t1x_star - omega_t0x_star) + + return delta_1 + + def fit(self, t, m, x, y): + """Fits nuisance parameters to data""" + # bucketize if needed + t, m, x, y = self._resize(t, m, x, y) + + if (not is_array_binary(m)) and (self._ratio == "density"): + raise ValueError( + "The option mediator 'density' in TMLE is supported only for 1D binary mediator" + ) + + self._fit_treatment_propensity_x(t, x) + self._fit_conditional_mean_outcome(t, m, x, y) + + if self._ratio == "density": + self._fit_binary_mediator_probability(t, m, x) + + elif self._ratio == "propensities": + self._fit_treatment_propensity_xm(t, m, x) + + self._fitted = True + + if self.verbose: + print("Nuisance models fitted") + + return self + + @fitted + def estimate(self, t, m, x, y): + """Estimates causal effect on data""" + theta_0 = self._one_step_correction_direct(t, m, x, y) + delta_1 = self._one_step_correction_indirect(t, m, x, y) + total_effect = theta_0 + delta_1 + direct_effect_treated = None + direct_effect_control = theta_0 + indirect_effect_treated = delta_1 + indirect_effect_control = None + + causal_effects = { + "total_effect": total_effect, + "direct_effect_treated": direct_effect_treated, + "direct_effect_control": direct_effect_control, + "indirect_effect_treated": indirect_effect_treated, + "indirect_effect_control": indirect_effect_control, + } + return causal_effects diff --git a/src/med_bench/get_estimation.py b/src/med_bench/get_estimation.py deleted file mode 100644 index 4ee3ae6..0000000 --- a/src/med_bench/get_estimation.py +++ /dev/null @@ -1,728 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- - -import numpy as np - -from .mediation import ( - mediation_IPW, - mediation_coefficient_product, - mediation_g_formula, - mediation_multiply_robust, - mediation_dml, - r_mediation_g_estimator, - r_mediation_dml, - r_mediate, -) - - -def get_estimation(x, t, m, y, estimator, config): - """Wrapper estimator fonction ; calls an estimator given mediation data - in order to estimate total, direct, and indirect effects. - - Parameters - ---------- - x : array-like, shape (n_samples, n_features_covariates) - Covariates value for each unit - t : array-like, shape (n_samples) - Treatment value for each unit - m : array-like, shape (n_samples, n_features_mediator) - Mediator value for each unit - y : array-like, shape (n_samples) - Outcome value for each unit - estimator : str - Label of the estimator - config : int - Indicates whether the estimator is suited to the data. - Should be 1 if dim_m=1 and type_m="binary", 5 otherwise. - This is a legacy parameter, will be removed in future updates. - - Returns - ------- - list - A list of estimated effects : - [total effect, - direct effect on the exposed, - direct effect on the unexposed, - indirect effect on the exposed, - indirect effect on the unexposed, - number of discarded samples OR non-discarded samples] - - Raises - ------ - UserWarning - If estimator name is misspelled. - """ - effects = None - if estimator == "mediation_IPW_R": - x_r, t_r, m_r, y_r = [_convert_array_to_R(uu) for uu in (x, t, m, y)] - output_w = causalweight.medweight( - y=y_r, d=t_r, m=m_r, x=x_r, trim=0.0, ATET="FALSE", logit="TRUE", boot=2 - ) - raw_res_R = np.array(output_w.rx2("results")) - effects = raw_res_R[0, :] - elif estimator == "coefficient_product": - effects = mediation_coefficient_product(y, t, m, x) - elif estimator == "mediation_ipw_noreg": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=False, - forest=False, - crossfit=0, - clip=1e-6, - calibration=None, - ) - elif estimator == "mediation_ipw_noreg_cf": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=False, - forest=False, - crossfit=2, - clip=1e-6, - calibration=None, - ) - elif estimator == "mediation_ipw_reg": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=False, - crossfit=0, - clip=1e-6, - calibration=None, - ) - elif estimator == "mediation_ipw_reg_cf": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=False, - crossfit=2, - clip=1e-6, - calibration=None, - ) - elif estimator == "mediation_ipw_reg_calibration": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=False, - crossfit=0, - clip=1e-6, - calibration=None, - ) - elif estimator == "mediation_ipw_reg_calibration_iso": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=False, - crossfit=0, - clip=1e-6, - calibration="isotonic", - ) - elif estimator == "mediation_ipw_reg_calibration_cf": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=False, - crossfit=2, - clip=1e-6, - calibration='sigmoid', - ) - elif estimator == "mediation_ipw_reg_calibration_iso_cf": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=False, - crossfit=2, - clip=1e-6, - calibration="isotonic", - ) - elif estimator == "mediation_ipw_forest": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=True, - crossfit=0, - clip=1e-6, - calibration=None, - ) - elif estimator == "mediation_ipw_forest_cf": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=True, - crossfit=2, - clip=1e-6, - calibration=None, - ) - elif estimator == "mediation_ipw_forest_calibration": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=True, - crossfit=0, - clip=1e-6, - calibration=None, - ) - elif estimator == "mediation_ipw_forest_calibration_iso": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=True, - crossfit=0, - clip=1e-6, - calibration="isotonic", - ) - elif estimator == "mediation_ipw_forest_calibration_cf": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=True, - crossfit=2, - clip=1e-6, - calibration='sigmoid', - ) - elif estimator == "mediation_ipw_forest_calibration_iso_cf": - effects = mediation_IPW( - y, - t, - m, - x, - trim=0, - regularization=True, - forest=True, - crossfit=2, - clip=1e-6, - calibration="isotonic", - ) - elif estimator == "mediation_g_computation_noreg": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=False, - crossfit=0, - regularization=False, - calibration=None, - ) - elif estimator == "mediation_g_computation_noreg_cf": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=False, - crossfit=2, - regularization=False, - calibration=None, - ) - elif estimator == "mediation_g_computation_reg": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=False, - crossfit=0, - regularization=True, - calibration=None, - ) - elif estimator == "mediation_g_computation_reg_cf": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=False, - crossfit=2, - regularization=True, - calibration=None, - ) - elif estimator == "mediation_g_computation_reg_calibration": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=False, - crossfit=0, - regularization=True, - calibration=None, - ) - elif estimator == "mediation_g_computation_reg_calibration_iso": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=False, - crossfit=0, - regularization=True, - calibration="isotonic", - ) - elif estimator == "mediation_g_computation_reg_calibration_cf": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=False, - crossfit=2, - regularization=True, - calibration='sigmoid', - ) - elif estimator == "mediation_g_computation_reg_calibration_iso_cf": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=False, - crossfit=2, - regularization=True, - calibration="isotonic", - ) - elif estimator == "mediation_g_computation_forest": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=True, - crossfit=0, - regularization=True, - calibration=None, - ) - elif estimator == "mediation_g_computation_forest_cf": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=True, - crossfit=2, - regularization=True, - calibration=None, - ) - elif estimator == "mediation_g_computation_forest_calibration": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=True, - crossfit=0, - regularization=True, - calibration='sigmoid', - ) - elif estimator == "mediation_g_computation_forest_calibration_iso": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=True, - crossfit=0, - regularization=True, - calibration="isotonic", - ) - elif estimator == "mediation_g_computation_forest_calibration_cf": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=True, - crossfit=2, - regularization=True, - calibration='sigmoid', - ) - elif estimator == "mediation_g_computation_forest_calibration_iso_cf": - if config in (0, 1, 2): - effects = mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=True, - crossfit=2, - regularization=True, - calibration="isotonic", - ) - elif estimator == "mediation_multiply_robust_noreg": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=False, - crossfit=0, - clip=1e-6, - regularization=False, - calibration=None, - ) - elif estimator == "mediation_multiply_robust_noreg_cf": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=False, - crossfit=2, - clip=1e-6, - regularization=False, - calibration=None, - ) - elif estimator == "mediation_multiply_robust_reg": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=False, - crossfit=0, - clip=1e-6, - regularization=True, - calibration=None, - ) - elif estimator == "mediation_multiply_robust_reg_cf": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=False, - crossfit=2, - clip=1e-6, - regularization=True, - calibration=None, - ) - elif estimator == "mediation_multiply_robust_reg_calibration": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=False, - crossfit=0, - clip=1e-6, - regularization=True, - calibration='sigmoid', - ) - elif estimator == "mediation_multiply_robust_reg_calibration_iso": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=False, - crossfit=0, - clip=1e-6, - regularization=True, - calibration="isotonic", - ) - elif estimator == "mediation_multiply_robust_reg_calibration_cf": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=False, - crossfit=2, - clip=1e-6, - regularization=True, - calibration='sigmoid', - ) - elif estimator == "mediation_multiply_robust_reg_calibration_iso_cf": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=False, - crossfit=2, - clip=1e-6, - regularization=True, - calibration="isotonic", - ) - elif estimator == "mediation_multiply_robust_forest": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=True, - crossfit=0, - clip=1e-6, - regularization=True, - calibration=None, - ) - elif estimator == "mediation_multiply_robust_forest_cf": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=True, - crossfit=2, - clip=1e-6, - regularization=True, - calibration=None, - ) - elif estimator == "mediation_multiply_robust_forest_calibration": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=True, - crossfit=0, - clip=1e-6, - regularization=True, - calibration='sigmoid', - ) - elif estimator == "mediation_multiply_robust_forest_calibration_iso": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=True, - crossfit=0, - clip=1e-6, - regularization=True, - calibration="isotonic", - ) - elif estimator == "mediation_multiply_robust_forest_calibration_cf": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=True, - crossfit=2, - clip=1e-6, - regularization=True, - calibration='sigmoid', - ) - elif estimator == "mediation_multiply_robust_forest_calibration_iso_cf": - if config in (0, 1, 2): - effects = mediation_multiply_robust( - y, - t, - m.astype(int), - x, - interaction=False, - forest=True, - crossfit=2, - clip=1e-6, - regularization=True, - calibration="isotonic", - ) - elif estimator == "simulation_based": - if config in (0, 1, 2): - effects = r_mediate(y, t, m, x, interaction=False) - elif estimator == "mediation_dml": - if config > 0: - effects = r_mediation_dml(y, t, m, x, trim=0.0, order=1) - elif estimator == "mediation_dml_noreg": - effects = mediation_dml( - y, - t, - m, - x, - trim=0, - clip=1e-6, - regularization=False, - calibration=None) - elif estimator == "mediation_dml_reg": - effects = mediation_dml( - y, t, m, x, trim=0, clip=1e-6, calibration=None) - elif estimator == "mediation_dml_reg_fixed_seed": - effects = mediation_dml( - y, t, m, x, trim=0, clip=1e-6, random_state=321, calibration=None) - elif estimator == "mediation_dml_noreg_cf": - effects = mediation_dml( - y, - t, - m, - x, - trim=0, - clip=1e-6, - crossfit=2, - regularization=False, - calibration=None) - elif estimator == "mediation_dml_reg_cf": - effects = mediation_dml( - y, t, m, x, trim=0, clip=1e-6, crossfit=2, calibration=None) - elif estimator == "mediation_dml_reg_calibration": - effects = mediation_dml( - y, t, m, x, trim=0, clip=1e-6, crossfit=0, calibration='sigmoid') - elif estimator == "mediation_dml_forest": - effects = mediation_dml( - y, - t, - m, - x, - trim=0, - clip=1e-6, - crossfit=0, - calibration=None, - forest=True) - elif estimator == "mediation_dml_forest_calibration": - effects = mediation_dml( - y, - t, - m, - x, - trim=0, - clip=1e-6, - crossfit=0, - calibration='sigmoid', - forest=True) - elif estimator == "mediation_dml_reg_calibration_cf": - effects = mediation_dml( - y, - t, - m, - x, - trim=0, - clip=1e-6, - crossfit=2, - calibration='sigmoid', - forest=False) - elif estimator == "mediation_dml_forest_cf": - effects = mediation_dml( - y, - t, - m, - x, - trim=0, - clip=1e-6, - crossfit=2, - calibration=None, - forest=True) - elif estimator == "mediation_dml_forest_calibration_cf": - effects = mediation_dml( - y, - t, - m, - x, - trim=0, - clip=1e-6, - crossfit=2, - calibration='sigmoid', - forest=True) - elif estimator == "mediation_g_estimator": - if config in (0, 1, 2): - effects = r_mediation_g_estimator(y, t, m, x) - else: - raise ValueError("Unrecognized estimator label.") - if effects is None: - if config not in (0, 1, 2): - raise ValueError("Estimator only supports 1D binary mediator.") - raise ValueError("Estimator does not support 1D binary mediator.") - return effects diff --git a/src/med_bench/mediation.py b/src/med_bench/mediation.py deleted file mode 100644 index 5d2071b..0000000 --- a/src/med_bench/mediation.py +++ /dev/null @@ -1,918 +0,0 @@ -""" -the objective of this script is to implement estimators for mediation in -causal inference, simulate data, and evaluate and compare estimators -""" - -# first step, run r code to have the original implementation by Huber -# using rpy2 to have the same data in R and python... - -import numpy as np -import pandas as pd -from sklearn.base import clone -from sklearn.linear_model import RidgeCV - - -from .utils.nuisances import (_estimate_conditional_mean_outcome, - _estimate_cross_conditional_mean_outcome, - _estimate_cross_conditional_mean_outcome_nesting, - _estimate_mediator_density, - _estimate_treatment_probabilities, - _get_classifier, _get_regressor) -from .utils.utils import r_dependency_required - -ALPHAS = np.logspace(-5, 5, 8) -CV_FOLDS = 5 -TINY = 1.e-12 - - -def mediation_IPW(y, t, m, x, trim, regularization=True, forest=False, - crossfit=0, clip=1e-6, calibration='sigmoid'): - """ - IPW estimator presented in - HUBER, Martin. Identifying causal mechanisms (primarily) based on inverse - probability weighting. Journal of Applied Econometrics, 2014, - vol. 29, no 6, p. 920-943. - - Parameters - ---------- - y : array-like, shape (n_samples) - outcome value for each unit, continuous - - t : array-like, shape (n_samples) - treatment value for each unit, binary - - m : array-like, shape (n_samples, n_features_mediator) - mediator value for each unit, can be continuous or binary, and - multidimensional - - x : array-like, shape (n_samples, n_features_covariates) - covariates (potential confounders) values - - trim : float - Trimming rule for discarding observations with extreme propensity - scores. In the absence of post-treatment confounders (w=NULL), - observations with Pr(D=1|M,X)(1-trim) are - dropped. In the presence of post-treatment confounders - (w is defined), observations with Pr(D=1|M,W,X)(1-trim) are dropped. - - regularization : boolean, default=True - whether to use regularized models (logistic or - linear regression). If True, cross-validation is used - to chose among 8 potential log-spaced values between - 1e-5 and 1e5 - - forest : boolean, default=False - whether to use a random forest model to estimate the propensity - scores instead of logistic regression - - crossfit : integer, default=0 - number of folds for cross-fitting - - clip : float, default=1e-6 - limit to clip for numerical stability (min=clip, max=1-clip) - - calibration : str, default=sigmoid - calibration mode; for example using a sigmoid function - - Returns - ------- - float - total effect - float - direct effect treated (\theta(1)) - float - direct effect nontreated (\theta(0)) - float - indirect effect treated (\delta(1)) - float - indirect effect untreated (\delta(0)) - int - number of used observations (non trimmed) - """ - # estimate propensities - classifier_t_x = _get_classifier(regularization, forest, calibration) - classifier_t_xm = _get_classifier(regularization, forest, calibration) - p_x, p_xm = _estimate_treatment_probabilities(t, m, x, crossfit, - classifier_t_x, - classifier_t_xm) - - # trimming. Following causal weight code, not sure I understand - # why we trim only on p_xm and not on p_x - ind = ((p_xm > trim) & (p_xm < (1 - trim))) - y, t, p_x, p_xm = y[ind], t[ind], p_x[ind], p_xm[ind] - - # note on the names, ytmt' = Y(t, M(t')), the treatment needs to be - # binary but not the mediator - p_x = np.clip(p_x, clip, 1 - clip) - p_xm = np.clip(p_xm, clip, 1 - clip) - - # importance weighting - y1m1 = np.sum(y * t / p_x) / np.sum(t / p_x) - y1m0 = np.sum(y * t * (1 - p_xm) / (p_xm * (1 - p_x))) /\ - np.sum(t * (1 - p_xm) / (p_xm * (1 - p_x))) - y0m0 = np.sum(y * (1 - t) / (1 - p_x)) /\ - np.sum((1 - t) / (1 - p_x)) - y0m1 = np.sum(y * (1 - t) * p_xm / ((1 - p_xm) * p_x)) /\ - np.sum((1 - t) * p_xm / ((1 - p_xm) * p_x)) - - return (y1m1 - y0m0, - y1m1 - y0m1, - y1m0 - y0m0, - y1m1 - y1m0, - y0m1 - y0m0, - np.sum(ind)) - - -def mediation_coefficient_product(y, t, m, x, interaction=False, - regularization=True): - """ - found an R implementation https://cran.r-project.org/package=regmedint - - implements very simple model of mediation - Y ~ X + T + M - M ~ X + T - estimation method is product of coefficients - - Parameters - ---------- - y : array-like, shape (n_samples) - outcome value for each unit, continuous - - t : array-like, shape (n_samples) - treatment value for each unit, binary - - m : array-like, shape (n_samples) - mediator value for each unit, can be continuous or binary, and - is necessary in 1D - - x : array-like, shape (n_samples, n_features_covariates) - covariates (potential confounders) values - - interaction : boolean, default=False - whether to include interaction terms in the model - not implemented here, just for compatibility of signature - function - - regularization : boolean, default=True - whether to use regularized models (logistic or - linear regression). If True, cross-validation is used - to chose among 8 potential log-spaced values between - 1e-5 and 1e5 - - Returns - ------- - float - total effect - float - direct effect treated (\theta(1)) - float - direct effect nontreated (\theta(0)) - float - indirect effect treated (\delta(1)) - float - indirect effect untreated (\delta(0)) - int - number of used observations (non trimmed) - """ - if regularization: - alphas = ALPHAS - else: - alphas = [TINY] - if len(x.shape) == 1: - x = x.reshape(-1, 1) - if len(m.shape) == 1: - m = m.reshape(-1, 1) - if len(t.shape) == 1: - t = t.reshape(-1, 1) - coef_t_m = np.zeros(m.shape[1]) - for i in range(m.shape[1]): - m_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS)\ - .fit(np.hstack((x, t)), m[:, i]) - coef_t_m[i] = m_reg.coef_[-1] - y_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS)\ - .fit(np.hstack((x, t, m)), y.ravel()) - - # return total, direct and indirect effect - direct_effect = y_reg.coef_[x.shape[1]] - indirect_effect = sum(y_reg.coef_[x.shape[1] + 1:] * coef_t_m) - return (direct_effect + indirect_effect, - direct_effect, - direct_effect, - indirect_effect, - indirect_effect, - None) - - -def mediation_g_formula(y, t, m, x, interaction=False, forest=False, - crossfit=0, regularization=True, - calibration='sigmoid'): - """ - Warning : m needs to be binary - - implementation of the g formula for mediation - - Parameters - ---------- - y : array-like, shape (n_samples) - outcome value for each unit, continuous - - t : array-like, shape (n_samples) - treatment value for each unit, binary - - m : array-like, shape (n_samples) - mediator value for each unit, here m is necessary binary and uni- - dimensional - - x : array-like, shape (n_samples, n_features_covariates) - covariates (potential confounders) values - - interaction : boolean, default=False - whether to include interaction terms in the model - interactions are terms XT, TM, MX - - forest : boolean, default=False - whether to use a random forest model to estimate the propensity - scores instead of logistic regression, and outcome model instead - of linear regression - - crossfit : integer, default=0 - number of folds for cross-fitting - - regularization : boolean, default=True - whether to use regularized models (logistic or - linear regression). If True, cross-validation is used - to chose among 8 potential log-spaced values between - 1e-5 and 1e5 - - calibration : str, default=sigmoid - calibration mode; for example using a sigmoid function - """ - # estimate mediator densities - classifier_m = _get_classifier(regularization, forest, calibration) - f_00x, f_01x, f_10x, f_11x, _, _ = _estimate_mediator_density(t, m, x, y, - crossfit, - classifier_m, - interaction) - - # estimate conditional mean outcomes - regressor_y = _get_regressor(regularization, forest) - mu_00x, mu_01x, mu_10x, mu_11x, _, _ = ( - _estimate_conditional_mean_outcome(t, m, x, y, crossfit, regressor_y, - interaction)) - - # G computation - direct_effect_i1 = mu_11x - mu_01x - direct_effect_i0 = mu_10x - mu_00x - n = len(y) - direct_effect_treated = (direct_effect_i1 * f_11x - + direct_effect_i0 * f_10x).sum() / n - direct_effect_control = (direct_effect_i1 * f_01x - + direct_effect_i0 * f_00x).sum() / n - indirect_effect_i1 = f_11x - f_01x - indirect_effect_i0 = f_10x - f_00x - indirect_effect_treated = (indirect_effect_i1 * mu_11x - + indirect_effect_i0 * mu_10x).sum() / n - indirect_effect_control = (indirect_effect_i1 * mu_01x - + indirect_effect_i0 * mu_00x).sum() / n - total_effect = direct_effect_control + indirect_effect_treated - - return (total_effect, - direct_effect_treated, - direct_effect_control, - indirect_effect_treated, - indirect_effect_control, - None) - - -def alternative_estimator(y, t, m, x, regularization=True): - """ - presented in - HUBER, Martin, LECHNER, Michael, et MELLACE, Giovanni. - The finite sample performance of estimators for mediation analysis under - sequential conditional independence. - Journal of Business & Economic Statistics, 2016, vol. 34, no 1, p. 139-160. - section 3.2.2 - - Parameters - ---------- - y : array-like, shape (n_samples) - outcome value for each unit, continuous - - t : array-like, shape (n_samples) - treatment value for each unit, binary - - m : array-like, shape (n_samples) - mediator value for each unit, here m is necessary binary - and unidimensional - - x : array-like, shape (n_samples, n_features_covariates) - covariates (potential confounders) values - - regularization : boolean, default=True - whether to use regularized models (logistic or - linear regression). If True, cross-validation is used - to chose among 8 potential log-spaced values between - 1e-5 and 1e5 - """ - if regularization: - alphas = ALPHAS - else: - alphas = [TINY] - if len(x.shape) == 1: - x = x.reshape(-1, 1) - if len(m.shape) == 1: - m = m.reshape(-1, 1) - treated = (t == 1) - - # computation of direct effect - y_treated_reg_m = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit( - np.hstack((x[treated], m[treated])), y[treated]) - y_ctrl_reg_m = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit( - np.hstack((x[~treated], m[~treated])), y[~treated]) - xm = np.hstack((x, m)) - direct_effect = np.sum(y_treated_reg_m.predict(xm) - - y_ctrl_reg_m.predict(xm)) / len(y) - - # computation of total effect - y_treated_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit( - x[treated], y[treated]) - y_ctrl_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS).fit( - x[~treated], y[~treated]) - total_effect = np.sum(y_treated_reg.predict(x) - - y_ctrl_reg.predict(x)) / len(y) - - # computation of indirect effect - indirect_effect = total_effect - direct_effect - - return (total_effect, - direct_effect, - direct_effect, - indirect_effect, - indirect_effect, - None) - - -def mediation_multiply_robust(y, t, m, x, interaction=False, forest=False, - crossfit=0, clip=1e-6, normalized=True, - regularization=True, calibration="sigmoid"): - """ - Presented in Eric J. Tchetgen Tchetgen. Ilya Shpitser. - "Semiparametric theory for causal mediation analysis: Efficiency bounds, - multiple robustness and sensitivity analysis." - Ann. Statist. 40 (3) 1816 - 1845, June 2012. - https://doi.org/10.1214/12-AOS990 - - Parameters - ---------- - y : array-like, shape (n_samples) - Outcome value for each unit, continuous - - t : array-like, shape (n_samples) - Treatment value for each unit, binary - - m : array-like, shape (n_samples) - Mediator value for each unit, binary and unidimensional - - x : array-like, shape (n_samples, n_features_covariates) - Covariates value for each unit, continuous - - interaction : boolean, default=False - Whether to include interaction terms in the model - interactions are terms XT, TM, MX - - forest : boolean, default=False - Whether to use a random forest model to estimate the propensity - scores instead of logistic regression, and outcome model instead - of linear regression - - crossfit : integer, default=0 - Number of folds for cross-fitting. If crossfit<2, no cross-fitting is - applied - - clip : float, default=1e-6 - Limit to clip p_x and f_mtx for numerical stability (min=clip, - max=1-clip) - - normalized : boolean, default=True - Normalizes the inverse probability-based weights so they add up to 1, - as described in "Identifying causal mechanisms (primarily) based on - inverse probability weighting", Huber (2014), - https://doi.org/10.1002/jae.2341 - - regularization : boolean, default=True - Whether to use regularized models (logistic or linear regression). - If True, cross-validation is used to chose among 8 potential - log-spaced values between 1e-5 and 1e5 - - calibration : str, default="sigmoid" - Which calibration method to use. - Implemented calibration methods are "sigmoid" and "isotonic". - - - Returns - ------- - total : float - Average total effect. - direct1 : float - Direct effect on the exposed. - direct0 : float - Direct effect on the unexposed, - indirect1 : float - Indirect effect on the exposed. - indirect0 : float - Indirect effect on the unexposed. - n_discarded : int - Number of discarded samples due to trimming. - - - Raises - ------ - ValueError - - If t or y are multidimensional. - - If x, t, m, or y don't have the same length. - - If m is not binary. - """ - # Format checking - if len(y) != len(y.ravel()): - raise ValueError("Multidimensional y is not supported") - if len(t) != len(t.ravel()): - raise ValueError("Multidimensional t is not supported") - if len(m) != len(m.ravel()): - raise ValueError("Multidimensional m is not supported") - - n = len(y) - if len(x.shape) == 1: - x.reshape(n, 1) - if len(m.shape) == 1: - m.reshape(n, 1) - - dim_m = m.shape[1] - if n * dim_m != sum(m.ravel() == 1) + sum(m.ravel() == 0): - raise ValueError("m is not binary") - - y = y.ravel() - t = t.ravel() - m = m.ravel() - if n != len(x) or n != len(m) or n != len(t): - raise ValueError("Inputs don't have the same number of observations") - - # estimate propensities - classifier_t_x = _get_classifier(regularization, forest, calibration) - p_x, _ = _estimate_treatment_probabilities(t, m, x, crossfit, - classifier_t_x, - clone(classifier_t_x)) - - # estimate mediator densities - classifier_m = _get_classifier(regularization, forest, calibration) - f_00x, f_01x, f_10x, f_11x, f_m0x, f_m1x = ( - _estimate_mediator_density(t, m, x, y, crossfit, - classifier_m, interaction)) - f = f_00x, f_01x, f_10x, f_11x - - # estimate conditional mean outcomes - regressor_y = _get_regressor(regularization, forest) - regressor_cross_y = _get_regressor(regularization, forest) - mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 = ( - _estimate_cross_conditional_mean_outcome(t, m, x, y, crossfit, - regressor_y, - regressor_cross_y, f, - interaction)) - - # clipping - p_x_clip = p_x != np.clip(p_x, clip, 1 - clip) - f_m0x_clip = f_m0x != np.clip(f_m0x, clip, 1 - clip) - f_m1x_clip = f_m1x != np.clip(f_m1x, clip, 1 - clip) - clipped = p_x_clip + f_m0x_clip + f_m1x_clip - - var_name = ["t", "y", "p_x", "f_m0x", "f_m1x", "mu_1mx", "mu_0mx"] - var_name += ["E_mu_t1_t1", "E_mu_t0_t0", "E_mu_t1_t0", "E_mu_t0_t1"] - n_discarded = 0 - for var in var_name: - exec(f"{var} = {var}[~clipped]") - n_discarded += np.sum(clipped) - - # score computing - if normalized: - sum_score_m1 = np.mean(t / p_x) - sum_score_m0 = np.mean((1 - t) / (1 - p_x)) - sum_score_t1m0 = np.mean((t / p_x) * (f_m0x / f_m1x)) - sum_score_t0m1 = np.mean((1 - t) / (1 - p_x) * (f_m1x / f_m0x)) - - y1m1 = (t / p_x * (y - E_mu_t1_t1)) / sum_score_m1 + E_mu_t1_t1 - y0m0 = (((1 - t) / (1 - p_x) * (y - E_mu_t0_t0)) / sum_score_m0 - + E_mu_t0_t0) - y1m0 = ( - ((t / p_x) * (f_m0x / f_m1x) * (y - mu_1mx)) / sum_score_t1m0 - + ((1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0)) / sum_score_m0 - + E_mu_t1_t0 - ) - y0m1 = ( - ((1 - t) / (1 - p_x) * (f_m1x / f_m0x) * (y - mu_0mx)) - / sum_score_t0m1 + t / p_x * ( - mu_0mx - E_mu_t0_t1) / sum_score_m1 - + E_mu_t0_t1 - ) - else: - y1m1 = t / p_x * (y - E_mu_t1_t1) + E_mu_t1_t1 - y0m0 = (1 - t) / (1 - p_x) * (y - E_mu_t0_t0) + E_mu_t0_t0 - y1m0 = ( - (t / p_x) * (f_m0x / f_m1x) * (y - mu_1mx) - + (1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0) - + E_mu_t1_t0 - ) - y0m1 = ( - (1 - t) / (1 - p_x) * (f_m1x / f_m0x) * (y - mu_0mx) - + t / p_x * (mu_0mx - E_mu_t0_t1) - + E_mu_t0_t1 - ) - - # effects computing - total = np.mean(y1m1 - y0m0) - direct1 = np.mean(y1m1 - y0m1) - direct0 = np.mean(y1m0 - y0m0) - indirect1 = np.mean(y1m1 - y1m0) - indirect0 = np.mean(y0m1 - y0m0) - - return total, direct1, direct0, indirect1, indirect0, n_discarded - - -@r_dependency_required(['mediation', 'stats', 'base']) -def r_mediate(y, t, m, x, interaction=False): - """ - This function calls the R function mediate from the package mediation - (https://cran.r-project.org/package=mediation) - - Parameters - ---------- - y : array-like, shape (n_samples) - outcome value for each unit, continuous - - t : array-like, shape (n_samples) - treatment value for each unit, binary - - m : array-like, shape (n_samples) - mediator value for each unit, here m is necessary binary and - unidimensional - - x : array-like, shape (n_samples, n_features_covariates) - covariates (potential confounders) values - - interaction : boolean, default=False - whether to include interaction terms in the model - interactions are terms XT, TM, MX - """ - - import rpy2.robjects as robjects - import rpy2.robjects.packages as rpackages - from rpy2.robjects import numpy2ri, pandas2ri - - pandas2ri.activate() - numpy2ri.activate() - - mediation = rpackages.importr('mediation') - Rstats = rpackages.importr('stats') - base = rpackages.importr('base') - - m = m.ravel() - var_names = [[y, 'y'], - [t, 't'], - [m, 'm'], - [x, 'x']] - df_list = list() - for var, name in var_names: - if len(var.shape) > 1: - var_dim = var.shape[1] - col_names = ['{}_{}'.format(name, i) for i in range(var_dim)] - sub_df = pd.DataFrame(var, columns=col_names) - else: - sub_df = pd.DataFrame(var, columns=[name]) - df_list.append(sub_df) - df = pd.concat(df_list, axis=1) - m_features = [c for c in df.columns if ('y' not in c) and ('m' not in c)] - y_features = [c for c in df.columns if ('y' not in c)] - if not interaction: - m_formula = 'm ~ ' + ' + '.join(m_features) - y_formula = 'y ~ ' + ' + '.join(y_features) - else: - m_formula = 'm ~ ' + ' + '.join(m_features + - [':'.join(p) for p in - combinations(m_features, 2)]) - y_formula = 'y ~ ' + ' + '.join(y_features + - [':'.join(p) for p in - combinations(y_features, 2)]) - robjects.globalenv['df'] = df - mediator_model = Rstats.lm(m_formula, data=base.as_symbol('df')) - outcome_model = Rstats.lm(y_formula, data=base.as_symbol('df')) - res = mediation.mediate(mediator_model, outcome_model, treat='t', - mediator='m', boot=True, sims=1) - - relevant_variables = ['tau.coef', 'z1', 'z0', 'd1', 'd0'] - to_return = [np.array(res.rx2(v))[0] for v in relevant_variables] - return to_return + [None] - - -@r_dependency_required(['plmed', 'base']) -def r_mediation_g_estimator(y, t, m, x): - """ - This function calls the R G-estimator from the package plmed - (https://github.com/ohines/plmed) - """ - - import rpy2.robjects as robjects - import rpy2.robjects.packages as rpackages - from rpy2.robjects import numpy2ri, pandas2ri - - pandas2ri.activate() - numpy2ri.activate() - - plmed = rpackages.importr('plmed') - base = rpackages.importr('base') - - m = m.ravel() - var_names = [[y, 'y'], - [t, 't'], - [m, 'm'], - [x, 'x']] - df_list = list() - for var, name in var_names: - if len(var.shape) > 1: - var_dim = var.shape[1] - col_names = ['{}_{}'.format(name, i) for i in range(var_dim)] - sub_df = pd.DataFrame(var, columns=col_names) - else: - sub_df = pd.DataFrame(var, columns=[name]) - df_list.append(sub_df) - df = pd.concat(df_list, axis=1) - m_features = [c for c in df.columns if ('x' in c)] - y_features = [c for c in df.columns if ('x' in c)] - t_features = [c for c in df.columns if ('x' in c)] - m_formula = 'm ~ ' + ' + '.join(m_features) - y_formula = 'y ~ ' + ' + '.join(y_features) - t_formula = 't ~ ' + ' + '.join(t_features) - robjects.globalenv['df'] = df - res = plmed.G_estimation(t_formula, - m_formula, - y_formula, - exposure_family='binomial', - data=base.as_symbol('df')) - direct_effect = res.rx2('coef')[0] - indirect_effect = res.rx2('coef')[1] - return (direct_effect + indirect_effect, - direct_effect, - direct_effect, - indirect_effect, - indirect_effect, - None) - - -@r_dependency_required(['causalweight', 'base']) -def r_mediation_dml(y, t, m, x, trim=0.05, order=1): - """ - This function calls the R Double Machine Learning estimator from the - package causalweight (https://cran.r-project.org/web/packages/causalweight) - - Parameters - ---------- - y : array-like, shape (n_samples) - outcome value for each unit, continuous - - t : array-like, shape (n_samples) - treatment value for each unit, binary - - m : array-like, shape (n_samples, n_features_mediator) - mediator value for each unit, can be continuous or binary, and - multi-dimensional - - x : array-like, shape (n_samples, n_features_covariates) - covariates (potential confounders) values - - trim : float, default=0.05 - Trimming rule for discarding observations with extreme - conditional treatment or mediator probabilities - (or products thereof). Observations with (products of) - conditional probabilities that are smaller than trim in any - denominator of the potential outcomes are dropped. - - order : integer, default=1 - If set to an integer larger than 1, then polynomials of that - order and interactions using the power series) rather than the - original control variables are used in the estimation of any - conditional probability or conditional mean outcome. - Polynomials/interactions are created using the Generate. - Powers command of the LARF package. - """ - - import rpy2.robjects.packages as rpackages - from rpy2.robjects import numpy2ri, pandas2ri - from .utils.utils import _convert_array_to_R - - pandas2ri.activate() - numpy2ri.activate() - - causalweight = rpackages.importr('causalweight') - base = rpackages.importr('base') - - x_r, t_r, m_r, y_r = [base.as_matrix(_convert_array_to_R(uu)) for uu in - (x, t, m, y)] - res = causalweight.medDML(y_r, t_r, m_r, x_r, trim=trim, order=order) - raw_res_R = np.array(res.rx2('results')) - ntrimmed = res.rx2('ntrimmed')[0] - return list(raw_res_R[0, :5]) + [ntrimmed] - - -def mediation_dml(y, t, m, x, forest=False, crossfit=0, trim=0.05, clip=1e-6, - normalized=True, regularization=True, random_state=None, - calibration=None): - """ - Python implementation of Double Machine Learning procedure, as described - in : - Helmut Farbmacher and others, Causal mediation analysis with double - machine learning, - The Econometrics Journal, Volume 25, Issue 2, May 2022, Pages 277–300, - https://doi.org/10.1093/ectj/utac003 - - Parameters - ---------- - - y : array-like, shape (n_samples) - Outcome value for each unit. - - t : array-like, shape (n_samples) - Treatment value for each unit. - - m : array-like, shape (n_samples, n_features_mediator) - Mediator value for each unit, multidimensional or continuous. - - x : array-like, shape (n_samples, n_features_covariates) - Covariates value for each unit, multidimensional or continuous. - - forest : boolean, default=False - Whether to use a random forest model to estimate the propensity - scores instead of logistic regression, and outcome model instead - of linear regression. - - crossfit : int, default=0 - Number of folds for cross-fitting. - - trim : float, default=0.05 - Trimming treshold for discarding observations with extreme probability. - - clip : float, default=1e-6 - limit to clip for numerical stability (min=clip, max=1-clip) - - normalized : boolean, default=True - Normalizes the inverse probability-based weights so they add up to 1, - as described in "Identifying causal mechanisms (primarily) based on - inverse probability weighting", - Huber (2014), https://doi.org/10.1002/jae.2341 - - regularization : boolean, default=True - Whether to use regularized models (logistic or linear regression). - If True, cross-validation is used to chose among 8 potential - log-spaced values between 1e-5 and 1e5. - - random_state : int, default=None - LogisticRegression random state instance. - - calibration : {None, "sigmoid", "isotonic"}, default=None - Whether to add a calibration step for the classifier used to estimate - the treatment propensity score and P(T|M,X). "None" means no - calibration. - Calibration ensures the output of the [predict_proba] - (https://scikit-learn.org/stable/glossary.html#term-predict_proba) - method can be directly interpreted as a confidence level. - Implemented calibration methods are "sigmoid" and "isotonic". - - Returns - ------- - total : float - Average total effect. - direct1 : float - Direct effect on the exposed. - direct0 : float - Direct effect on the unexposed, - indirect1 : float - Indirect effect on the exposed. - indirect0 : float - Indirect effect on the unexposed. - n_discarded : int - Number of discarded samples due to trimming. - - Raises - ------ - ValueError - - If t or y are multidimensional. - - If x, t, m, or y don't have the same length. - """ - # check format - if len(y) != len(y.ravel()): - raise ValueError("Multidimensional y is not supported") - - if len(t) != len(t.ravel()): - raise ValueError("Multidimensional t is not supported") - - n = len(y) - t = t.ravel() - y = y.ravel() - - if n != len(x) or n != len(m) or n != len(t): - raise ValueError("Inputs don't have the same number of observations") - - if len(x.shape) == 1: - x.reshape(n, 1) - - if len(m.shape) == 1: - m.reshape(n, 1) - - nobs = 0 - - var_name = [ - "p_x", - "p_xm", - "mu_1mx", - "mu_0mx", - "E_mu_t1_t0", - "E_mu_t0_t1", - "E_mu_t1_t1", - "E_mu_t0_t0", - ] - - # estimate propensities - classifier_t_x = _get_classifier(regularization, forest, calibration) - classifier_t_xm = _get_classifier(regularization, forest, calibration) - p_x, p_xm = _estimate_treatment_probabilities(t, m, x, crossfit, - classifier_t_x, - classifier_t_xm) - - # estimate conditional mean outcomes - regressor_y = _get_regressor(regularization, forest) - regressor_cross_y = _get_regressor(regularization, forest) - - mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 = ( - _estimate_cross_conditional_mean_outcome_nesting(t, m, x, y, crossfit, - regressor_y, - regressor_cross_y)) - - # trimming - not_trimmed = ( - (((1 - p_xm) * p_x) >= trim) - * ((1 - p_x) >= trim) - * (p_x >= trim) - * ((p_xm * (1 - p_x)) >= trim) - ) - for var in var_name: - exec(f"{var} = {var}[not_trimmed]") - nobs = np.sum(not_trimmed) - - # clipping - p_x = np.clip(p_x, clip, 1 - clip) - p_xm = np.clip(p_xm, clip, 1 - clip) - - # score computing - if normalized: - sum_score_m1 = np.mean(t / p_x) - sum_score_m0 = np.mean((1 - t) / (1 - p_x)) - sum_score_t1m0 = np.mean(t * (1 - p_xm) / (p_xm * (1 - p_x))) - sum_score_t0m1 = np.mean((1 - t) * p_xm / ((1 - p_xm) * p_x)) - y1m1 = (t / p_x * (y - E_mu_t1_t1)) / sum_score_m1 + E_mu_t1_t1 - y0m0 = (((1 - t) / (1 - p_x) * (y - E_mu_t0_t0)) / sum_score_m0 - + E_mu_t0_t0) - y1m0 = ( - (t * (1 - p_xm) / (p_xm * (1 - p_x)) * (y - mu_1mx)) - / sum_score_t1m0 + ((1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0)) - / sum_score_m0 + E_mu_t1_t0 - ) - y0m1 = ( - ((1 - t) * p_xm / ((1 - p_xm) * p_x) * (y - mu_0mx)) - / sum_score_t0m1 - + (t / p_x * (mu_0mx - E_mu_t0_t1)) / sum_score_m1 - + E_mu_t0_t1 - ) - else: - y1m1 = t / p_x * (y - E_mu_t1_t1) + E_mu_t1_t1 - y0m0 = (1 - t) / (1 - p_x) * (y - E_mu_t0_t0) + E_mu_t0_t0 - y1m0 = ( - t * (1 - p_xm) / (p_xm * (1 - p_x)) * (y - mu_1mx) - + (1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0) - + E_mu_t1_t0 - ) - y0m1 = ( - (1 - t) * p_xm / ((1 - p_xm) * p_x) * (y - mu_0mx) - + t / p_x * (mu_0mx - E_mu_t0_t1) - + E_mu_t0_t1 - ) - - # mean score computing - my1m1 = np.mean(y1m1) - my0m0 = np.mean(y0m0) - my1m0 = np.mean(y1m0) - my0m1 = np.mean(y0m1) - - # effects computing - total = my1m1 - my0m0 - direct1 = my1m1 - my0m1 - direct0 = my1m0 - my0m0 - indirect1 = my1m1 - my1m0 - indirect0 = my0m1 - my0m0 - return total, direct1, direct0, indirect1, indirect0, n - nobs diff --git a/src/med_bench/utils/constants.py b/src/med_bench/utils/constants.py index 417f993..396e54d 100644 --- a/src/med_bench/utils/constants.py +++ b/src/med_bench/utils/constants.py @@ -5,105 +5,81 @@ # CONSTANTS USED FOR TESTS # TOLERANCE THRESHOLDS - -TOLERANCE_THRESHOLDS = { - "SMALL": { - "ATE": 0.05, - "DIRECT": 0.05, - "INDIRECT": 0.2, - }, - "MEDIUM": { - "ATE": 0.10, - "DIRECT": 0.10, - "INDIRECT": 0.4, - }, - "LARGE": { - "ATE": 0.15, - "DIRECT": 0.15, - "INDIRECT": 0.9, - }, - "INFINITE": { - "ATE": np.inf, - "DIRECT": np.inf, - "INDIRECT": np.inf, - }, -} - - -def get_tolerance_array(tolerance_size: str) -> np.array: - """Get tolerance array for tolerance tests - - Parameters - ---------- - tolerance_size : str - tolerance size, can be "SMALL", "MEDIUM", "LARGE" or "INFINITE" - - Returns - ------- - np.array - array of size 5 containing the ATE, DIRECT (*2) and INDIRECT (*2) effects tolerance - """ - - return np.array( - [ - TOLERANCE_THRESHOLDS[tolerance_size]["ATE"], - TOLERANCE_THRESHOLDS[tolerance_size]["DIRECT"], - TOLERANCE_THRESHOLDS[tolerance_size]["DIRECT"], - TOLERANCE_THRESHOLDS[tolerance_size]["INDIRECT"], - TOLERANCE_THRESHOLDS[tolerance_size]["INDIRECT"], - ] - ) - - -SMALL_TOLERANCE = get_tolerance_array("SMALL") -MEDIUM_TOLERANCE = get_tolerance_array("MEDIUM") -LARGE_TOLERANCE = get_tolerance_array("LARGE") -INFINITE_TOLERANCE = get_tolerance_array("INFINITE") - -TOLERANCE_DICT = { - "coefficient_product": LARGE_TOLERANCE, - "mediation_ipw_noreg": INFINITE_TOLERANCE, - "mediation_ipw_reg": INFINITE_TOLERANCE, - "mediation_ipw_reg_calibration": INFINITE_TOLERANCE, - "mediation_ipw_forest": INFINITE_TOLERANCE, - "mediation_ipw_forest_calibration": INFINITE_TOLERANCE, - "mediation_g_computation_noreg": LARGE_TOLERANCE, - "mediation_g_computation_reg": MEDIUM_TOLERANCE, - "mediation_g_computation_reg_calibration": LARGE_TOLERANCE, - "mediation_g_computation_forest": LARGE_TOLERANCE, - "mediation_g_computation_forest_calibration": INFINITE_TOLERANCE, - "mediation_multiply_robust_noreg": INFINITE_TOLERANCE, - "mediation_multiply_robust_reg": LARGE_TOLERANCE, - "mediation_multiply_robust_reg_calibration": LARGE_TOLERANCE, - "mediation_multiply_robust_forest": INFINITE_TOLERANCE, - "mediation_multiply_robust_forest_calibration": LARGE_TOLERANCE, - "simulation_based": LARGE_TOLERANCE, - "mediation_dml": INFINITE_TOLERANCE, - "mediation_dml_reg_fixed_seed": INFINITE_TOLERANCE, - "mediation_g_estimator": LARGE_TOLERANCE, - "mediation_ipw_noreg_cf": INFINITE_TOLERANCE, - "mediation_ipw_reg_cf": INFINITE_TOLERANCE, - "mediation_ipw_reg_calibration_cf": INFINITE_TOLERANCE, - "mediation_ipw_forest_cf": INFINITE_TOLERANCE, - "mediation_ipw_forest_calibration_cf": INFINITE_TOLERANCE, - "mediation_g_computation_noreg_cf": SMALL_TOLERANCE, - "mediation_g_computation_reg_cf": LARGE_TOLERANCE, - "mediation_g_computation_reg_calibration_cf": LARGE_TOLERANCE, - "mediation_g_computation_forest_cf": INFINITE_TOLERANCE, - "mediation_g_computation_forest_calibration_cf": LARGE_TOLERANCE, - "mediation_multiply_robust_noreg_cf": MEDIUM_TOLERANCE, - "mediation_multiply_robust_reg_cf": LARGE_TOLERANCE, - "mediation_multiply_robust_reg_calibration_cf": MEDIUM_TOLERANCE, - "mediation_multiply_robust_forest_cf": INFINITE_TOLERANCE, - "mediation_multiply_robust_forest_calibration_cf": INFINITE_TOLERANCE, -} - -ESTIMATORS = list(TOLERANCE_DICT.keys()) - -R_DEPENDENT_ESTIMATORS = [ - "mediation_IPW_R", "simulation_based", "mediation_dml", "mediation_g_estimator" +DEFAULT_TOLERANCE = np.array([0.05, 0.05, 0.05, 0.2, 0.2]) + +TOLERANCE_FACTOR_DICT = { + "coefficient_product-M1D_binary_1DX": np.array([1, 1, 1, 4, 4]), + "coefficient_product-M1D_binary_5DX": np.array([1, 1, 1, 3.5, 3.5]), + "coefficient_product-M5D_continuous_1DX": np.array([1, 1, 1, 1.5, 1.5]), + "coefficient_product-M5D_continuous_5DX": np.array([1, 1, 1, 3.5, 3.5]), + "mediation_ipw_reg-M1D_binary_1DX": np.array([6, 1, 1, 100, 100]), + "mediation_ipw_reg-M1D_binary_5DX": np.array([2, 1.2, 1.2, 10, 10]), + "mediation_ipw_reg-M1D_continuous_1DX": np.array([6, 1.2, 1.2, 15, 15]), + "mediation_ipw_reg-M5D_continuous_1DX": np.array([6, 15, 15, 20, 20]), + "mediation_ipw_reg-M5D_continuous_5DX": np.array([2, 5, 5, 10, 10]), + "mediation_ipw_reg_calibration-M1D_binary_1DX": np.array([2, 2, 2, 10, 10]), + "mediation_ipw_reg_calibration-M1D_binary_5DX": np.array([1, 1, 1, 5, 5]), + "mediation_ipw_reg_calibration-M5D_continuous_1DX": np.array([1, 4, 4, 10, 10]), + "mediation_ipw_reg_calibration-M1D_continuous_5DX": np.array([1, 1, 1, 2, 2]), + "mediation_ipw_reg_calibration-M5D_continuous_5DX": np.array([1, 6, 6, 15, 15]), + "mediation_g_computation_reg-M1D_binary_5DX": np.array([2, 2, 2, 3, 3]), + "mediation_g_computation_reg-M1D_continuous_1DX": np.array([1, 1, 1, 1.5, 1.5]), + "mediation_g_computation_reg-M5D_continuous_1DX": np.array([1, 1, 1, 1.5, 1.5]), + "mediation_g_computation_reg-M1D_continuous_5DX": np.array([2, 2, 2, 4, 4]), + "mediation_g_computation_reg-M5D_continuous_5DX": np.array([1, 3, 3, 6, 6]), + "mediation_g_computation_reg_calibration-M1D_binary_1DX": np.array([1, 1, 1, 2, 2]), + "mediation_g_computation_reg_calibration-M1D_binary_5DX": np.array([1, 1, 1, 1.5, 1.5]), + "mediation_g_computation_reg_calibration-M1D_continuous_1DX": np.array([1, 2, 2, 4, 4]), + "mediation_g_computation_reg_calibration-M5D_continuous_1DX": np.array([1, 2, 2, 2.5, 2.5]), + "mediation_g_computation_reg_calibration-M1D_continuous_5DX": np.array([1, 2, 2, 5, 5]), + "mediation_g_computation_reg_calibration-M5D_continuous_5DX": np.array([6, 15, 15, 20, 20]), + "mediation_multiply_robust_reg-M1D_binary_1DX": np.array([1, 1, 1, 2, 2]), + "mediation_multiply_robust_reg-M1D_binary_5DX": np.array([1, 1, 1, 2, 2]), + "mediation_multiply_robust_reg-M1D_continuous_1DX": np.array([1, 1, 1, 2, 2]), + "mediation_multiply_robust_reg-M5D_continuous_1DX": np.array([1, 3, 3, 6, 6]), + "mediation_multiply_robust_reg-M1D_continuous_5DX": np.array([1, 1, 1, 2, 2]), + "mediation_multiply_robust_reg-M5D_continuous_5DX": np.array([1, 2, 2, 4, 4]), + "mediation_multiply_robust_reg_calibration-M1D_binary_1DX": np.array([1, 1, 1, 3, 3]), + "mediation_multiply_robust_reg_calibration-M1D_binary_5DX": np.array([1, 1, 1, 4, 4]), + "mediation_multiply_robust_reg_calibration-M1D_continuous_1DX": np.array([2, 2, 2, 3, 3]), + "mediation_multiply_robust_reg_calibration-M5D_continuous_1DX": np.array([2, 2, 2, 5, 5]), + "mediation_multiply_robust_reg_calibration-M1D_continuous_5DX": np.array([1, 1, 1, 2, 2]), + "mediation_multiply_robust_reg_calibration-M5D_continuous_5DX": np.array([1, 3, 3, 4, 4]), + "mediation_dml_reg-M1D_binary_1DX": np.array([1, 2, 2, 6, 6]), + "mediation_dml_reg-M1D_binary_5DX": np.array([1, 1, 1, 5, 5]), + "mediation_dml_reg-M5D_continuous_1DX": np.array([1, 10, 10, 20, 20]), + "mediation_dml_reg-M5D_continuous_5DX": np.array([1, 3, 3, 5, 5]), + "mediation_dml_reg_calibration-M1D_binary_1DX": np.array([1, 1, 1, 2, 2]), + "mediation_dml_reg_calibration-M1D_continuous_1DX": np.array([1, 1, 1, 2, 2]), + "mediation_dml_reg_calibration-M5D_continuous_1DX": np.array([1, 1, 1, 2, 2]), + "mediation_dml_reg_calibration-M1D_continuous_5DX": np.array([1, 1, 1, 2, 2]), + "mediation_dml_reg_calibration-M5D_continuous_5DX": np.array([1, 3, 3, 6, 6]), + "mediation_tmle_propensities-M1D_binary_1DX": np.array([1, 1, 1, 2, 2]), + "mediation_tmle_propensities-M1D_continuous_1DX": np.array([1, 1, 1, 2, 2]), + "mediation_tmle_propensities-M5D_continuous_1DX": np.array([1, 2, 2, 2, 2]), + "mediation_tmle_propensities-M1D_continuous_5DX": np.array([1, 1, 1, 3, 3]), + "mediation_tmle_propensities-M5D_continuous_5DX": np.array([3, 3, 3, 15, 15]), + "mediation_tmle_density-M1D_binary_1DX": np.array([1, 1, 1, 3, 3]), + "mediation_tmle_density-M1D_binary_5DX": np.array([1, 1, 1, 2, 2]), +} + + + +ESTIMATORS = [ + "coefficient_product", + "mediation_ipw_reg", + "mediation_ipw_reg_calibration", + "mediation_g_computation_reg", + "mediation_g_computation_reg_calibration", + "mediation_multiply_robust_reg", + "mediation_multiply_robust_reg_calibration", + "mediation_dml_reg", + "mediation_dml_reg_calibration", + "mediation_tmle_propensities", + "mediation_tmle_density" ] + # PARAMETERS VALUES FOR DATA GENERATION PARAMETER_NAME = [ @@ -156,3 +132,20 @@ def get_tolerance_array(tolerance_size: str) -> np.array: ) ) ) + +CONFIGURATION_NAMES = ["M1D_binary_1DX", + "M1D_binary_5DX", + "M1D_continuous_1DX", + "M5D_continuous_1DX", + "M1D_continuous_5DX", + "M5D_continuous_5DX"] + +CONFIG_DICT = {CONFIGURATION_NAMES[i]: + dict(zip(PARAMETER_NAME, PARAMETER_LIST[i])) + for i in range(len(CONFIGURATION_NAMES))} + + + +ALPHAS = np.logspace(-5, 5, 8) +CV_FOLDS = 5 +TINY = 1.0e-12 diff --git a/src/med_bench/utils/decorators.py b/src/med_bench/utils/decorators.py new file mode 100644 index 0000000..9e0b286 --- /dev/null +++ b/src/med_bench/utils/decorators.py @@ -0,0 +1,53 @@ +import time + + +def timeit(func): + def timed(*args, **kwargs): + ts = time.time() + result = func(*args, **kwargs) + te = time.time() + + if 'log_time' in kwargs: + name = kwargs.get('log_name', func.__name__.upper()) + kwargs['log_time'][name] = int((te - ts) * 1000) + else: + print('%r %2.2f ms' % (func.__name__, (te - ts) * 1000)) + return result + return timed + + +def accepts(*types): + def check_accepts(func): + assert len(types) == func.__code__.co_argcount - 1 + + def wrapper(self, *args, **kwargs): + for (a, t) in zip(args, types): + assert isinstance(a, t), \ + "arg %r does not match %s" % (a, t) + return func(self, *args, **kwargs) + wrapper.__name__ = func.__name__ + return wrapper + return check_accepts + + +def accepts_(*types): + def check_accepts(func): + assert len(types) == func.__code__.co_argcount + + def wrapper(*args, **kwargs): + for (a, t) in zip(args, types): + assert isinstance(a, t), \ + "arg %r does not match %s" % (a, t) + return func(*args, **kwargs) + wrapper.__name__ = func.__name__ + return wrapper + return check_accepts + + +def fitted(func): + def wrapper(self, *args, **kwargs): + if self._fitted: + return func(self, *args, **kwargs) + else: + raise RuntimeError("Model not fitted yet") + return wrapper \ No newline at end of file diff --git a/src/med_bench/utils/nuisances.py b/src/med_bench/utils/nuisances.py deleted file mode 100644 index ded68f0..0000000 --- a/src/med_bench/utils/nuisances.py +++ /dev/null @@ -1,485 +0,0 @@ -""" -the objective of this script is to implement nuisances functions -used in mediation estimators in causal inference -""" -import numpy as np -from sklearn.base import clone -from sklearn.calibration import CalibratedClassifierCV -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LogisticRegressionCV, RidgeCV -from sklearn.model_selection import KFold - -from .utils import check_r_dependencies, _get_interactions - -if check_r_dependencies(): - from .utils import _convert_array_to_R - - -ALPHAS = np.logspace(-5, 5, 8) -CV_FOLDS = 5 -TINY = 1.e-12 - - -def _get_train_test_lists(crossfit, n, x): - """ - Obtain train and test folds - - Returns - ------- - train_test_list : list - indexes with train and test indexes - """ - if crossfit < 2: - train_test_list = [[np.arange(n), np.arange(n)]] - else: - kf = KFold(n_splits=crossfit) - train_test_list = list() - for train_index, test_index in kf.split(x): - train_test_list.append([train_index, test_index]) - return train_test_list - - -def _get_regularization_parameters(regularization): - """ - Obtain regularization parameters - - Returns - ------- - cs : list - each of the values in Cs describes the inverse of regularization - strength for predictors - alphas : list - alpha values to try in ridge models - """ - if regularization: - alphas = ALPHAS - cs = ALPHAS - else: - alphas = [TINY] - cs = [np.inf] - - return cs, alphas - - -def _get_classifier(regularization, forest, calibration, random_state=42): - """ - Obtain context classifiers to estimate treatment probabilities. - - Returns - ------- - clf : classifier on contexts, etc. for predicting P(T=1|X), - P(T=1|X, M) or f(M|T,X) - """ - cs, _ = _get_regularization_parameters(regularization) - - if not forest: - clf = LogisticRegressionCV(random_state=random_state, Cs=cs, - cv=CV_FOLDS) - else: - clf = RandomForestClassifier(random_state=random_state, - n_estimators=100, min_samples_leaf=10) - if calibration in {"sigmoid", "isotonic"}: - clf = CalibratedClassifierCV(clf, method=calibration) - - return clf - - -def _get_regressor(regularization, forest, random_state=42): - """ - Obtain regressors to estimate conditional mean outcomes. - - Returns - ------- - reg : regressor on contexts, etc. for predicting E[Y|T,M,X], etc. - """ - _, alphas = _get_regularization_parameters(regularization) - - if not forest: - reg = RidgeCV(alphas=alphas, cv=CV_FOLDS) - else: - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=random_state) - - return reg - - -def _estimate_treatment_probabilities(t, m, x, crossfit, clf_t_x, clf_t_xm): - """ - Estimate treatment probabilities P(T=1|X) and P(T=1|X, M) with train - test lists from crossfitting - - Returns - ------- - p_x : array-like, shape (n_samples) - probabilities P(T=1|X) - p_xm : array-like, shape (n_samples) - probabilities P(T=1|X, M) - """ - n = len(t) - - p_x, p_xm = [np.zeros(n) for h in range(2)] - # compute propensity scores - if len(x.shape) == 1: - x = x.reshape(-1, 1) - if len(m.shape) == 1: - m = m.reshape(-1, 1) - if len(t.shape) == 1: - t = t.reshape(-1, 1) - - train_test_list = _get_train_test_lists(crossfit, n, x) - - xm = np.hstack((x, m)) - - for train_index, test_index in train_test_list: - - # p_x, p_xm model fitting - clf_t_x = clf_t_x.fit(x[train_index, :], t[train_index]) - clf_t_xm = clf_t_xm.fit(xm[train_index, :], t[train_index]) - - # predict P(T=1|X), P(T=1|X, M) - p_x[test_index] = clf_t_x.predict_proba(x[test_index, :])[:, 1] - p_xm[test_index] = clf_t_xm.predict_proba(xm[test_index, :])[:, 1] - - return p_x, p_xm - - -def _estimate_mediator_density(t, m, x, y, crossfit, clf_m, interaction): - """ - Estimate mediator density f(M|T,X) - with train test lists from crossfitting - - Returns - ------- - f_00x: array-like, shape (n_samples) - probabilities f(M=0|T=0,X) - f_01x, array-like, shape (n_samples) - probabilities f(M=0|T=1,X) - f_10x, array-like, shape (n_samples) - probabilities f(M=1|T=0,X) - f_11x, array-like, shape (n_samples) - probabilities f(M=1|T=1,X) - f_m0x, array-like, shape (n_samples) - probabilities f(M|T=0,X) - f_m1x, array-like, shape (n_samples) - probabilities f(M|T=1,X) - """ - n = len(y) - if len(x.shape) == 1: - x = x.reshape(-1, 1) - - if len(t.shape) == 1: - t = t.reshape(-1, 1) - - t0 = np.zeros((n, 1)) - t1 = np.ones((n, 1)) - - m = m.ravel() - - train_test_list = _get_train_test_lists(crossfit, n, x) - - f_00x, f_01x, f_10x, f_11x, f_m0x, f_m1x = [np.zeros(n) for _ in range(6)] - - t_x = _get_interactions(interaction, t, x) - - t0_x = _get_interactions(interaction, t0, x) - t1_x = _get_interactions(interaction, t1, x) - - for train_index, test_index in train_test_list: - - test_ind = np.arange(len(test_index)) - - # f_mtx model fitting - clf_m = clf_m.fit(t_x[train_index, :], m[train_index]) - - # predict f(M=m|T=t,X) - fm_0 = clf_m.predict_proba(t0_x[test_index, :]) - f_00x[test_index] = fm_0[:, 0] - f_01x[test_index] = fm_0[:, 1] - fm_1 = clf_m.predict_proba(t1_x[test_index, :]) - f_10x[test_index] = fm_1[:, 0] - f_11x[test_index] = fm_1[:, 1] - - # predict f(M|T=t,X) - f_m0x[test_index] = fm_0[test_ind, m[test_index].astype(int)] - f_m1x[test_index] = fm_1[test_ind, m[test_index].astype(int)] - - return f_00x, f_01x, f_10x, f_11x, f_m0x, f_m1x - - -def _estimate_conditional_mean_outcome(t, m, x, y, crossfit, reg_y, - interaction): - """ - Estimate conditional mean outcome E[Y|T,M,X] - with train test lists from crossfitting - - Returns - ------- - mu_00x: array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=0,M=0,X] - mu_01x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=0,M=1,X] - mu_10x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=1,M=0,X] - mu_11x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=1,M=1,X] - mu_m0x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=0,M,X] - mu_m1x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=1,M,X] - """ - n = len(y) - if len(x.shape) == 1: - x = x.reshape(-1, 1) - if len(m.shape) == 1: - mr = m.reshape(-1, 1) - else: - mr = np.copy(m) - if len(t.shape) == 1: - t = t.reshape(-1, 1) - - t0 = np.zeros((n, 1)) - t1 = np.ones((n, 1)) - m0 = np.zeros((n, 1)) - m1 = np.ones((n, 1)) - - train_test_list = _get_train_test_lists(crossfit, n, x) - - mu_11x, mu_10x, mu_01x, mu_00x, mu_1mx, mu_0mx = [np.zeros(n) for _ in - range(6)] - - x_t_mr = _get_interactions(interaction, x, t, mr) - - x_t1_m1 = _get_interactions(interaction, x, t1, m1) - x_t1_m0 = _get_interactions(interaction, x, t1, m0) - x_t0_m1 = _get_interactions(interaction, x, t0, m1) - x_t0_m0 = _get_interactions(interaction, x, t0, m0) - - x_t1_m = _get_interactions(interaction, x, t1, m) - x_t0_m = _get_interactions(interaction, x, t0, m) - - for train_index, test_index in train_test_list: - - # mu_tm model fitting - reg_y = reg_y.fit(x_t_mr[train_index, :], y[train_index]) - - # predict E[Y|T=t,M=m,X] - mu_00x[test_index] = reg_y.predict(x_t0_m0[test_index, :]) - mu_01x[test_index] = reg_y.predict(x_t0_m1[test_index, :]) - mu_10x[test_index] = reg_y.predict(x_t1_m0[test_index, :]) - mu_11x[test_index] = reg_y.predict(x_t1_m1[test_index, :]) - - # predict E[Y|T=t,M,X] - mu_0mx[test_index] = reg_y.predict(x_t0_m[test_index, :]) - mu_1mx[test_index] = reg_y.predict(x_t1_m[test_index, :]) - - return mu_00x, mu_01x, mu_10x, mu_11x, mu_0mx, mu_1mx - - -def _estimate_cross_conditional_mean_outcome(t, m, x, y, crossfit, reg_y, - reg_cross_y, f, interaction): - """ - Estimate the conditional mean outcome, - the cross conditional mean outcome - - Returns - ------- - mu_m0x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=0,M,X] - mu_m1x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=1,M,X] - E_mu_t0_t0, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=0,M,X]|T=0,X] - E_mu_t0_t1, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=0,M,X]|T=1,X] - E_mu_t1_t0, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=1,M,X]|T=0,X] - E_mu_t1_t1, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=1,M,X]|T=1,X] - """ - n = len(y) - - # Initialisation - ( - mu_1mx, # E[Y|T=1,M,X] - mu_0mx, # E[Y|T=0,M,X] - mu_11x, # E[Y|T=1,M=1,X] - mu_10x, # E[Y|T=1,M=0,X] - mu_01x, # E[Y|T=0,M=1,X] - mu_00x, # E[Y|T=0,M=0,X] - E_mu_t0_t0, # E[E[Y|T=0,M,X]|T=0,X] - E_mu_t0_t1, # E[E[Y|T=0,M,X]|T=1,X] - E_mu_t1_t0, # E[E[Y|T=1,M,X]|T=0,X] - E_mu_t1_t1, # E[E[Y|T=1,M,X]|T=1,X] - ) = [np.zeros(n) for _ in range(10)] - - t0, m0 = np.zeros((n, 1)), np.zeros((n, 1)) - t1, m1 = np.ones((n, 1)), np.ones((n, 1)) - - train_test_list = _get_train_test_lists(crossfit, n, x) - - x_t_m = _get_interactions(interaction, x, t, m) - x_t1_m = _get_interactions(interaction, x, t1, m) - x_t0_m = _get_interactions(interaction, x, t0, m) - - x_t0_m0 = _get_interactions(interaction, x, t0, m0) - x_t0_m1 = _get_interactions(interaction, x, t0, m1) - x_t1_m0 = _get_interactions(interaction, x, t1, m0) - x_t1_m1 = _get_interactions(interaction, x, t1, m1) - - f_00x, f_01x, f_10x, f_11x = f - - # Cross-fitting loop - for train_index, test_index in train_test_list: - # Index declaration - ind_t0 = t[test_index] == 0 - - # mu_tm model fitting - reg_y = reg_y.fit(x_t_m[train_index, :], y[train_index]) - - # predict E[Y|T=t,M,X] - mu_1mx[test_index] = reg_y.predict(x_t1_m[test_index, :]) - mu_0mx[test_index] = reg_y.predict(x_t0_m[test_index, :]) - - # predict E[Y|T=t,M=m,X] - mu_00x[test_index] = reg_y.predict(x_t0_m0[test_index, :]) - mu_01x[test_index] = reg_y.predict(x_t0_m1[test_index, :]) - mu_11x[test_index] = reg_y.predict(x_t1_m1[test_index, :]) - mu_10x[test_index] = reg_y.predict(x_t1_m0[test_index, :]) - - # E[E[Y|T=1,M=m,X]|T=t,X] model fitting - reg_y_t1m1_t0 = clone(reg_cross_y).fit( - x[test_index, :][ind_t0, :], mu_11x[test_index][ind_t0] - ) - reg_y_t1m0_t0 = clone(reg_cross_y).fit( - x[test_index, :][ind_t0, :], mu_10x[test_index][ind_t0] - ) - reg_y_t1m1_t1 = clone(reg_cross_y).fit( - x[test_index, :][~ind_t0, :], mu_11x[test_index][~ind_t0] - ) - reg_y_t1m0_t1 = clone(reg_cross_y).fit( - x[test_index, :][~ind_t0, :], mu_10x[test_index][~ind_t0] - ) - - # predict E[E[Y|T=1,M=m,X]|T=t,X] - E_mu_t1_t0[test_index] = ( - reg_y_t1m0_t0.predict(x[test_index, :]) * f_00x[test_index] - + reg_y_t1m1_t0.predict(x[test_index, :]) * f_01x[test_index] - ) - E_mu_t1_t1[test_index] = ( - reg_y_t1m0_t1.predict(x[test_index, :]) * f_10x[test_index] - + reg_y_t1m1_t1.predict(x[test_index, :]) * f_11x[test_index] - ) - - # E[E[Y|T=0,M=m,X]|T=t,X] model fitting - reg_y_t0m1_t0 = clone(reg_cross_y).fit( - x[test_index, :][ind_t0, :], mu_01x[test_index][ind_t0] - ) - reg_y_t0m0_t0 = clone(reg_cross_y).fit( - x[test_index, :][ind_t0, :], mu_00x[test_index][ind_t0] - ) - reg_y_t0m1_t1 = clone(reg_cross_y).fit( - x[test_index, :][~ind_t0, :], mu_01x[test_index][~ind_t0] - ) - reg_y_t0m0_t1 = clone(reg_cross_y).fit( - x[test_index, :][~ind_t0, :], mu_00x[test_index][~ind_t0] - ) - - # predict E[E[Y|T=0,M=m,X]|T=t,X] - E_mu_t0_t0[test_index] = ( - reg_y_t0m0_t0.predict(x[test_index, :]) * f_00x[test_index] - + reg_y_t0m1_t0.predict(x[test_index, :]) * f_01x[test_index] - ) - E_mu_t0_t1[test_index] = ( - reg_y_t0m0_t1.predict(x[test_index, :]) * f_10x[test_index] - + reg_y_t0m1_t1.predict(x[test_index, :]) * f_11x[test_index] - ) - - return mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 - - -def _estimate_cross_conditional_mean_outcome_nesting(t, m, x, y, crossfit, - reg_y, reg_cross_y): - """ - Estimate treatment probabilities and the conditional mean outcome, - cross conditional mean outcome - - Estimate the conditional mean outcome, - the cross conditional mean outcome - - Returns - ------- - mu_m0x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=0,M,X] - mu_m1x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=1,M,X] - mu_0x, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=0,M,X]|T=0,X] - E_mu_t0_t1, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=0,M,X]|T=1,X] - E_mu_t1_t0, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=1,M,X]|T=0,X] - mu_1x, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=1,M,X]|T=1,X] - """ - n = len(y) - - # initialisation - ( - mu_1mx, # E[Y|T=1,M,X] - mu_1mx_nested, # E[Y|T=1,M,X] predicted on train_nested set - mu_0mx, # E[Y|T=0,M,X] - mu_0mx_nested, # E[Y|T=0,M,X] predicted on train_nested set - E_mu_t1_t0, # E[E[Y|T=1,M,X]|T=0,X] - E_mu_t0_t1, # E[E[Y|T=0,M,X]|T=1,X] - mu_1x, # E[Y|T=1,X] - mu_0x, # E[Y|T=0,X] - ) = [np.zeros(n) for _ in range(8)] - - xm = np.hstack((x, m)) - - train_test_list = _get_train_test_lists(crossfit, n, x) - - for train, test in train_test_list: - # define test set - train1 = train[t[train] == 1] - train0 = train[t[train] == 0] - - train_mean, train_nested = np.array_split(train, 2) - train_mean1 = train_mean[t[train_mean] == 1] - train_mean0 = train_mean[t[train_mean] == 0] - train_nested1 = train_nested[t[train_nested] == 1] - train_nested0 = train_nested[t[train_nested] == 0] - - # predict E[Y|T=1,M,X] - reg_y1m = clone(reg_y) - reg_y1m.fit(xm[train_mean1], y[train_mean1]) - mu_1mx[test] = reg_y1m.predict(xm[test]) - mu_1mx_nested[train_nested] = reg_y1m.predict(xm[train_nested]) - - # predict E[Y|T=0,M,X] - reg_y0m = clone(reg_y) - reg_y0m.fit(xm[train_mean0], y[train_mean0]) - mu_0mx[test] = reg_y0m.predict(xm[test]) - mu_0mx_nested[train_nested] = reg_y0m.predict(xm[train_nested]) - - # predict E[E[Y|T=1,M,X]|T=0,X] - reg_cross_y1 = clone(reg_cross_y) - reg_cross_y1.fit(x[train_nested0], mu_1mx_nested[train_nested0]) - E_mu_t1_t0[test] = reg_cross_y1.predict(x[test]) - - # predict E[E[Y|T=0,M,X]|T=1,X] - reg_cross_y0 = clone(reg_cross_y) - reg_cross_y0.fit(x[train_nested1], mu_0mx_nested[train_nested1]) - E_mu_t0_t1[test] = reg_cross_y0.predict(x[test]) - - # predict E[Y|T=1,X] - reg_y1 = clone(reg_y) - reg_y1.fit(x[train1], y[train1]) - mu_1x[test] = reg_y1.predict(x[test]) - - # predict E[Y|T=0,X] - reg_y0 = clone(reg_y) - reg_y0.fit(x[train0], y[train0]) - mu_0x[test] = reg_y0.predict(x[test]) - - return mu_0mx, mu_1mx, mu_0x, E_mu_t0_t1, E_mu_t1_t0, mu_1x diff --git a/src/med_bench/utils/utils.py b/src/med_bench/utils/utils.py index ab7f44d..0f2b16c 100644 --- a/src/med_bench/utils/utils.py +++ b/src/med_bench/utils/utils.py @@ -1,98 +1,12 @@ +from numpy.random import default_rng import numpy as np import pandas as pd - import subprocess -import warnings - - -def check_r_dependencies(): - try: - # Check if R is accessible by trying to get its version - subprocess.check_output(["R", "--version"]) - - # If the above command fails, it will raise a subprocess.CalledProcessError and won't reach here - # Assuming reaching here means R is accessible, now try importing rpy2 packages - import rpy2.robjects.packages as rpackages - required_packages = [ - 'causalweight', 'mediation', 'stats', 'base', 'grf', 'plmed' - ] - - for package in required_packages: - rpackages.importr(package) - - return True # All checks passed, R and required packages are available - - except: - # Handle the case where R is not found or rpy2 is not installed - return False +from med_bench.get_simulated_data import simulate_data +from med_bench.utils.constants import ALPHAS, TINY -def is_r_installed(): - try: - subprocess.check_output(["R", "--version"]) - return True - except: - return False - - -def check_r_package(package_name): - try: - import rpy2.robjects.packages as rpackages - rpackages.importr(package_name) - return True - except: - return False - - -class DependencyNotInstalledError(Exception): - pass - - -def r_dependency_required(required_packages): - def decorator(func): - def wrapper(*args, **kwargs): - if not is_r_installed(): - raise DependencyNotInstalledError( - "R is not installed or not found. " - "Please install R and set it up correctly in your system." - ) - - # To get rid of the 'DataFrame' object has no attribute 'iteritems' error due to pandas version mismatch in rpy2 - # https://stackoverflow.com/a/76404841 - pd.DataFrame.iteritems = pd.DataFrame.items - - for package in required_packages: - if not check_r_package(package): - if package != 'plmed': - raise DependencyNotInstalledError( - f"The '{package}' R package is not installed. " - "Please install it using R by running:\n" - "import rpy2.robjects.packages as rpackages\n" - "utils = rpackages.importr('utils')\n" - "utils.chooseCRANmirror(ind=33)\n" - f"utils.install_packages('{package}')" - ) - else: - raise DependencyNotInstalledError( - "The 'plmed' R package is not installed. " - "Please install it using R by running:\n" - "import rpy2.robjects.packages as rpackages\n" - "utils = rpackages.importr('utils')\n" - "utils.chooseCRANmirror(ind=33)\n" - "utils.install_packages('devtools')\n" - "devtools = rpackages.importr('devtools')\n" - "devtools.install_github('ohines/plmed')" - ) - return None - return func(*args, **kwargs) - return wrapper - return decorator - - -if is_r_installed(): - import rpy2.robjects as robjects - def _get_interactions(interaction, *args): """ @@ -137,7 +51,7 @@ def _get_interactions(interaction, *args): return pre_inter_variables new_cols = list() for i, var in enumerate(variables[:]): - for j, var2 in enumerate(variables[i+1:]): + for j, var2 in enumerate(variables[i + 1:]): for ii in range(var.shape[1]): for jj in range(var2.shape[1]): new_cols.append((var[:, ii] * var2[:, jj]).reshape(-1, 1)) @@ -146,15 +60,116 @@ def _get_interactions(interaction, *args): return result -def _convert_array_to_R(x): +class DependencyNotInstalledError(Exception): + pass + + +def _check_input(y, t, m, x, setting): """ - converts a numpy array to a R matrix or vector - >>> a = np.array([[1, 2, 3], [4, 5, 6]]) - >>> np.sum(a == np.array(_convert_array_to_R(a))) - 6 + internal function to check inputs. `_check_input` adjusts the dimension + of the input (matrix or vectors), and raises an error + - if the size of input is not adequate, + - or if the type of input is not supported (cotinuous treatment or + non-binary one-dimensional mediator if the specified setting parameter + is binary) + + Parameters + ---------- + y : array-like, shape (n_samples) + Outcome value for each unit, continuous + + t : array-like, shape (n_samples) + Treatment value for each unit, binary + + m : array-like, shape (n_samples, n_mediators) + Mediator value for each unit, binary and unidimensional + + x : array-like, shape (n_samples, n_features_covariates) + Covariates value for each unit, continuous + + setting : string + ('binary', 'continuous', 'multidimensional') value for the mediator + + Returns + ------- + y_converted : array-like, shape (n_samples,) + Outcome value for each unit, continuous + + t_converted : array-like, shape (n_samples,) + Treatment value for each unit, binary + + m_converted : array-like, shape (n_samples, n_mediators) + Mediator value for each unit, binary and unidimensional + + x_converted : array-like, shape (n_samples, n_features_covariates) + Covariates value for each unit, continuous """ + # check format + if len(y) != len(y.ravel()): + raise ValueError("Multidimensional y (outcome) is not supported") + + if len(t) != len(t.ravel()): + raise ValueError("Multidimensional t (exposure) is not supported") + + if len(np.unique(t)) != 2: + raise ValueError("Only a binary t (exposure) is supported") + + n = len(y) + t_converted = t.ravel() + y_converted = y.ravel() + + if n != len(x) or n != len(m) or n != len(t): + raise ValueError("Inputs don't have the same number of observations") + if len(x.shape) == 1: - return robjects.FloatVector(x) - elif len(x.shape) == 2: - return robjects.r.matrix(robjects.FloatVector(x.ravel()), - nrow=x.shape[0], byrow='TRUE') + x_converted = x.reshape(n, 1) + else: + x_converted = x + + if len(m.shape) == 1: + m_converted = m.reshape(n, 1) + else: + m_converted = m + + if (m_converted.shape[1] > 1) and (setting != "multidimensional"): + raise ValueError("Multidimensional m (mediator) is not supported") + + if (setting == "binary") and (len(np.unique(m)) != 2): + raise ValueError( + "Only a binary one-dimensional m (mediator) is supported") + + return y_converted, t_converted, m_converted, x_converted + + +def is_array_integer(array): + if array.shape[1] > 1: + return False + return all(list((array == array.astype(int)).squeeze())) + + +def is_array_binary(array): + if len(np.unique(array)) == 2: + return True + return False + + +def _get_regularization_parameters(regularization): + """ + Obtain regularization parameters + + Returns + ------- + cs : list + each of the values in Cs describes the inverse of regularization + strength for predictors + alphas : list + alpha values to try in ridge models + """ + if regularization: + alphas = ALPHAS + cs = ALPHAS + else: + alphas = [TINY] + cs = [np.inf] + + return cs, alphas diff --git a/src/tests/estimation/generate_tests_results.py b/src/tests/estimation/generate_tests_results.py deleted file mode 100644 index b698a05..0000000 --- a/src/tests/estimation/generate_tests_results.py +++ /dev/null @@ -1,63 +0,0 @@ -import numpy as np - -from med_bench.get_simulated_data import simulate_data -from med_bench.get_estimation import get_estimation - -from med_bench.utils.constants import ESTIMATORS, PARAMETER_LIST, PARAMETER_NAME - - -def _get_data_from_list(data): - """Get x, t, m, y from simulated data - """ - x = data[0] - t = data[1].ravel() - m = data[2] - y = data[3].ravel() - - return x, t, m, y - - -def _get_config_from_dict(dict_params): - """Get config parameter used for estimators parametrisation - """ - if dict_params["dim_m"] == 1 and dict_params["type_m"] == "binary": - config = 0 - else: - config = 5 - return config - - -def _get_estimators_results(x, t, m, y, config, estimator): - """Get estimation result from specified input parameters and estimator name - """ - - try: - res = get_estimation(x, t, m, y, estimator, config)[0:5] - return res - - except Exception as e: - print(f"{e}") - return str(e) - - -if __name__ == "__main__": - - results = [] - - for param_list in PARAMETER_LIST: - - # Get synthetic input data from parameters list defined above - dict_params = dict(zip(PARAMETER_NAME, param_list)) - data = simulate_data(**dict_params) - x, t, m, y = _get_data_from_list(data) - config = _get_config_from_dict(dict_params=dict_params) - - for estimator in ESTIMATORS: - - # Get results from synthetic inputs - result = _get_estimators_results(x, t, m, y, config, estimator) - row = [estimator, x, t, m, y, config, result] - results.append(row) - - # Store the results in a npy file - np.save("tests_results.npy", np.array(results, dtype="object")) diff --git a/src/tests/estimation/get_estimation_results.py b/src/tests/estimation/get_estimation_results.py new file mode 100644 index 0000000..26b21c3 --- /dev/null +++ b/src/tests/estimation/get_estimation_results.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import numpy as np + +from med_bench.estimation.mediation_coefficient_product import CoefficientProduct +from med_bench.estimation.mediation_dml import DoubleMachineLearning +from med_bench.estimation.mediation_g_computation import GComputation +from med_bench.estimation.mediation_ipw import InversePropensityWeighting +from med_bench.estimation.mediation_mr import MultiplyRobust +from med_bench.estimation.mediation_tmle import TMLE +from med_bench.utils.utils import _get_regularization_parameters +from med_bench.utils.constants import CV_FOLDS + +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LogisticRegressionCV, RidgeCV +from sklearn.calibration import CalibratedClassifierCV + + +def _transform_outputs(causal_effects): + """Transforms outputs in the old format + + Args: + causal_effects (dict): dictionary of causal effects + + Returns: + list: list of causal effects + """ + total = causal_effects["total_effect"] + direct_treated = causal_effects["direct_effect_treated"] + direct_control = causal_effects["direct_effect_control"] + indirect_treated = causal_effects["indirect_effect_treated"] + indirect_control = causal_effects["indirect_effect_control"] + return np.array([total, + direct_treated, + direct_control, + indirect_treated, + indirect_control]).astype(float) + + +def _get_estimation_results(x, t, m, y, estimator): + """Dynamically selects and calls an estimator (class-based or legacy function) to estimate total, direct, and indirect effects.""" + + effects = None # Initialize variable to store the effects + + # Helper function for regularized regressor and classifier initialization + def _get_regularized_regressor_and_classifier( + regularize=True, calibration=None, method="sigmoid" + ): + cs, alphas = _get_regularization_parameters(regularization=regularize) + clf = LogisticRegressionCV(random_state=42, Cs=cs, cv=CV_FOLDS) + reg = RidgeCV(alphas=alphas, cv=CV_FOLDS) + if calibration: + clf = CalibratedClassifierCV(clf, method=method) + return clf, reg + + if estimator == "coefficient_product": + # Class-based implementation for CoefficientProduct + estimator_obj = CoefficientProduct(regularize=True) + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + elif estimator == "mediation_ipw_reg": + # Class-based implementation with regularization + clf, reg = _get_regularized_regressor_and_classifier(regularize=True) + estimator_obj = InversePropensityWeighting(clip=1e-6, trim=0, classifier=clf) + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + elif estimator == "mediation_ipw_reg_calibration": + # Class-based implementation with regularization and calibration (sigmoid) + clf, reg = _get_regularized_regressor_and_classifier( + regularize=True, calibration=True, method="sigmoid" + ) + estimator_obj = InversePropensityWeighting(clip=1e-6, trim=0, classifier=clf) + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + elif estimator == "mediation_g_computation_reg": + # Class-based implementation of GComputation with regularization + clf, reg = _get_regularized_regressor_and_classifier(regularize=True) + estimator_obj = GComputation(regressor=reg, classifier=clf) + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + elif estimator == "mediation_g_computation_reg_calibration": + # Class-based implementation with regularization and calibrated sigmoid + clf, reg = _get_regularized_regressor_and_classifier( + regularize=True, calibration=True, method="sigmoid" + ) + estimator_obj = GComputation(regressor=reg, classifier=clf) + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + # Regularized MultiplyRobust estimator + elif estimator == "mediation_multiply_robust_reg": + clf, reg = _get_regularized_regressor_and_classifier(regularize=True) + estimator_obj = MultiplyRobust( + ratio="propensities", normalized=True, regressor=reg, classifier=clf + ) + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + # Regularized MultiplyRobust with sigmoid calibration + elif estimator == "mediation_multiply_robust_reg_calibration": + clf, reg = _get_regularized_regressor_and_classifier( + regularize=True, calibration=True, method="sigmoid" + ) + estimator_obj = MultiplyRobust( + ratio="propensities", normalized=True, regressor=reg, classifier=clf + ) + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + # Regularized Double Machine Learning + elif estimator == "mediation_dml_reg": + clf, reg = _get_regularized_regressor_and_classifier(regularize=True) + estimator_obj = DoubleMachineLearning( + normalized=True, regressor=reg, classifier=clf + ) + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + # Regularized Double Machine Learning + elif estimator == "mediation_dml_reg_calibration": + clf, reg = _get_regularized_regressor_and_classifier(regularize=True) + calibrated_clf = CalibratedClassifierCV(clf, method="sigmoid") + estimator_obj = DoubleMachineLearning( + normalized=True, regressor=reg, classifier=calibrated_clf + ) + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + # TMLE - ratio of propensities + elif estimator == "mediation_tmle_propensities": + clf, reg = _get_regularized_regressor_and_classifier(regularize=True) + estimator_obj = TMLE(regressor=reg, classifier=clf, ratio="propensities") + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + # TMLE - ratio of propensities + elif estimator == "mediation_tmle_density": + clf, reg = _get_regularized_regressor_and_classifier(regularize=True) + estimator_obj = TMLE(regressor=reg, classifier=clf, ratio="density") + estimator_obj.fit(t, m, x, y) + causal_effects = estimator_obj.estimate(t, m, x, y) + effects = _transform_outputs(causal_effects) + + else: + raise ValueError("Unrecognized estimator label for {}.".format(estimator)) + + # Catch unsupported estimators and raise an error + if effects is None: + raise ValueError( + f"Estimation failed for {estimator}. Check inputs or configuration." + ) + return effects diff --git a/src/tests/estimation/test_exact_estimation.py b/src/tests/estimation/test_exact_estimation.py deleted file mode 100644 index 98d2d96..0000000 --- a/src/tests/estimation/test_exact_estimation.py +++ /dev/null @@ -1,109 +0,0 @@ -""" -Pytest file for get_estimation.py - -It tests all the benchmark_mediation estimators : -- for a certain tolerance -- whether their effects satisfy "total = direct + indirect" -- whether they support (n,1) and (n,) inputs - -To be robust to future updates, tests are adjusted with a smaller tolerance when possible. -The test is skipped if estimator has not been implemented yet, i.e. if ValueError is raised. -The test fails for any other unwanted behavior. -""" - -from pprint import pprint -import pytest -import os -import numpy as np - -from med_bench.get_estimation import get_estimation -from med_bench.utils.constants import R_DEPENDENT_ESTIMATORS -from med_bench.utils.utils import DependencyNotInstalledError, check_r_dependencies - -current_dir = os.path.dirname(__file__) -true_estimations_file_path = os.path.join(current_dir, 'tests_results.npy') -TRUE_ESTIMATIONS = np.load(true_estimations_file_path, allow_pickle=True) - - -@pytest.fixture(params=range(TRUE_ESTIMATIONS.shape[0])) -def tests_results_idx(request): - return request.param - - -@pytest.fixture -def data(tests_results_idx): - return TRUE_ESTIMATIONS[tests_results_idx] - - -@pytest.fixture -def estimator(data): - return data[0] - - -@pytest.fixture -def x(data): - return data[1] - - -# t is raveled because some estimators fail with (n,1) inputs -@pytest.fixture -def t(data): - return data[2].ravel() - - -@pytest.fixture -def m(data): - return data[3] - - -@pytest.fixture -def y(data): - return data[4].ravel() # same reason as t - - -@pytest.fixture -def config(data): - return data[5] - - -@pytest.fixture -def result(data): - return data[6] - - -@pytest.fixture -def effects_chap(x, t, m, y, estimator, config): - # try whether estimator is implemented or not - - try: - res = get_estimation(x, t, m, y, estimator, config)[0:5] - - # NaN situations - if np.all(np.isnan(res)): - pytest.xfail("all effects are NaN") - elif np.any(np.isnan(res)): - pprint("NaN found") - - except Exception as e: - if str(e) in ( - "Estimator only supports 1D binary mediator.", - "Estimator does not support 1D binary mediator.", - ): - pytest.skip(f"{e}") - - # We skip the test if an error with function from glmet rpy2 package occurs - elif "glmnet::glmnet" in str(e): - pytest.skip(f"{e}") - - elif estimator in R_DEPENDENT_ESTIMATORS and not check_r_dependencies(): - assert isinstance(e, DependencyNotInstalledError) == True - pytest.skip(f"{e}") - - else: - pytest.fail(f"{e}") - - return res - - -def test_estimation_exactness(result, effects_chap): - assert np.all(effects_chap == pytest.approx(result, abs=0.01)) diff --git a/src/tests/estimation/test_get_estimation.py b/src/tests/estimation/test_get_estimation.py index 2e1adae..f93d014 100644 --- a/src/tests/estimation/test_get_estimation.py +++ b/src/tests/estimation/test_get_estimation.py @@ -14,86 +14,85 @@ from pprint import pprint import pytest import numpy as np +import os +from tests.estimation.get_estimation_results import _get_estimation_results from med_bench.get_simulated_data import simulate_data -from med_bench.get_estimation import get_estimation -from med_bench.utils.utils import DependencyNotInstalledError, check_r_dependencies -from med_bench.utils.constants import PARAMETER_LIST, PARAMETER_NAME, R_DEPENDENT_ESTIMATORS, TOLERANCE_DICT +from med_bench.utils.utils import DependencyNotInstalledError +from med_bench.utils.constants import ( + CONFIGURATION_NAMES, + CONFIG_DICT, + DEFAULT_TOLERANCE, + TOLERANCE_FACTOR_DICT, + ESTIMATORS, +) -@pytest.fixture(params=PARAMETER_LIST) -def dict_param(request): - return dict(zip(PARAMETER_NAME, request.param)) +@pytest.fixture(params=CONFIGURATION_NAMES) +def configuration_name(request): + return request.param + + +@pytest.fixture +def dict_param(configuration_name): + return CONFIG_DICT[configuration_name] +# Two distinct data fixtures @pytest.fixture -def data(dict_param): +def data_simulated(dict_param): return simulate_data(**dict_param) @pytest.fixture -def x(data): - return data[0] +def x(data_simulated): + return data_simulated[0] # t is raveled because some estimators fail with (n,1) inputs @pytest.fixture -def t(data): - return data[1].ravel() +def t(data_simulated): + return data_simulated[1].ravel() @pytest.fixture -def m(data): - return data[2] +def m(data_simulated): + return data_simulated[2] @pytest.fixture -def y(data): - return data[3].ravel() # same reason as t +def y(data_simulated): + return data_simulated[3].ravel() # same reason as t @pytest.fixture -def effects(data): - return np.array(data[4:9]) +def effects(data_simulated): + return np.array(data_simulated[4:9]) -@pytest.fixture(params=list(TOLERANCE_DICT.keys())) +@pytest.fixture(params=ESTIMATORS) def estimator(request): return request.param @pytest.fixture -def tolerance(estimator): - return TOLERANCE_DICT[estimator] - - -@pytest.fixture -def config(dict_param): - if dict_param["dim_m"] == 1 and dict_param["type_m"] == "binary": - return 0 - return 5 +def tolerance(estimator, configuration_name): + test_name = "{}-{}".format(estimator, configuration_name) + if test_name in TOLERANCE_FACTOR_DICT.keys(): + tolerance = DEFAULT_TOLERANCE * TOLERANCE_FACTOR_DICT[test_name] + else: + tolerance = DEFAULT_TOLERANCE + return tolerance @pytest.fixture -def effects_chap(x, t, m, y, estimator, config): +def effects_chap(x, t, m, y, estimator): # try whether estimator is implemented or not - try: - res = get_estimation(x, t, m, y, estimator, config)[0:5] + res = _get_estimation_results(x, t, m, y, estimator) except Exception as e: - if str(e) in ( - "Estimator only supports 1D binary mediator.", - "Estimator does not support 1D binary mediator.", - ): - pytest.skip(f"{e}") - - # We skip the test if an error with function from glmet rpy2 package occurs - elif "glmnet::glmnet" in str(e): - pytest.skip(f"{e}") - - elif estimator in R_DEPENDENT_ESTIMATORS and not check_r_dependencies(): - assert isinstance(e, DependencyNotInstalledError) == True + if "1D binary mediator" in str(e): pytest.skip(f"{e}") else: @@ -115,20 +114,22 @@ def test_tolerance(effects, effects_chap, tolerance): def test_total_is_direct_plus_indirect(effects_chap): if not np.isnan(effects_chap[1]): - assert effects_chap[0] == pytest.approx( - effects_chap[1] + effects_chap[4]) + assert effects_chap[0] == pytest.approx(effects_chap[1] + effects_chap[4]) if not np.isnan(effects_chap[2]): - assert effects_chap[0] == pytest.approx( - effects_chap[2] + effects_chap[3]) + assert effects_chap[0] == pytest.approx(effects_chap[2] + effects_chap[3]) -@pytest.mark.xfail -def test_robustness_to_ravel_format(data, estimator, config, effects_chap): +def test_robustness_to_ravel_format(data_simulated, estimator, effects_chap): if "forest" in estimator: pytest.skip("Forest estimator skipped") assert np.all( - get_estimation(data[0], data[1], data[2], - data[3], estimator, config)[0:5] + _get_estimation_results( + data_simulated[0], + data_simulated[1], + data_simulated[2], + data_simulated[3], + estimator, + ) == pytest.approx( effects_chap, nan_ok=True ) # effects_chap is obtained with data[1].ravel() and data[3].ravel() diff --git a/src/tests/estimation/tests_results.npy b/src/tests/estimation/tests_results.npy deleted file mode 100644 index b468ed3..0000000 Binary files a/src/tests/estimation/tests_results.npy and /dev/null differ