"""
===================================================
Failure of Machine Learning to infer causal effects
===================================================

Machine Learning models are great for measuring statistical associations.
Unfortunately, unless we're willing to make strong assumptions about the data,
those models are unable to infer causal effects.

To illustrate this, we will simulate a situation in which we try to answer one
of the most important questions in the economics of education: **what is the
causal effect of earning a college degree on hourly wages?** Although the
answer to this question is crucial to policy makers, `Omitted-Variable Biases
<https://en.wikipedia.org/wiki/Omitted-variable_bias>`_ (OVB) prevent us from
identifying that causal effect.
"""

# %%
# The dataset: simulated hourly wages
# -----------------------------------
#
# The data generating process is laid out in the code below. Work experience in
# years and a measure of ability are drawn from Normal distributions; the
# hourly wage of one of the parents is drawn from a Beta distribution. We then
# create an indicator of having a college degree, which is positively impacted
# by ability and parental hourly wage. Finally, we model hourly wages as a
# linear function of all the previous variables and a random component. Note
# that all variables have a positive effect on hourly wages.
import numpy as np
import pandas as pd

n_samples = 10_000
rng = np.random.RandomState(32)

experiences = rng.normal(20, 10, size=n_samples).astype(int)
experiences[experiences < 0] = 0
abilities = rng.normal(0, 0.15, size=n_samples)
parent_hourly_wages = 50 * rng.beta(2, 8, size=n_samples)
parent_hourly_wages[parent_hourly_wages < 0] = 0
college_degrees = (
    9 * abilities + 0.02 * parent_hourly_wages + rng.randn(n_samples) > 0.7
).astype(int)

true_coef = pd.Series(
    {
        "college degree": 2.0,
        "ability": 5.0,
        "experience": 0.2,
        "parent hourly wage": 1.0,
    }
)
hourly_wages = (
    true_coef["experience"] * experiences
    + true_coef["parent hourly wage"] * parent_hourly_wages
    + true_coef["college degree"] * college_degrees
    + true_coef["ability"] * abilities
    + rng.normal(0, 1, size=n_samples)
)

hourly_wages[hourly_wages < 0] = 0

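# %%
# As a quick sanity check on the simulated data (an addition, not part of the
# original example), we can print the share of college graduates and the
# average simulated hourly wage.
print(f"Share with a college degree: {college_degrees.mean():.2f}")
print(f"Average hourly wage: {hourly_wages.mean():.2f}")
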
# %%
# Description of the simulated data
# ---------------------------------
#
# The following plot shows the distribution of each variable, and pairwise
# scatter plots. Key to our OVB story is the positive relationship between
# ability and college degree.
import seaborn as sns

df = pd.DataFrame(
    {
        "college degree": college_degrees,
        "ability": abilities,
        "hourly wage": hourly_wages,
        "experience": experiences,
        "parent hourly wage": parent_hourly_wages,
    }
)

grid = sns.pairplot(df, diag_kind="kde", corner=True)

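# %%
# To quantify that relationship (a small addition, not part of the original
# example), we can print the correlation between ability and having a college
# degree; its positive value is what makes the omitted-variable bias below
# possible.
print(df[["ability", "college degree"]].corr())
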
# %%
# Before training predictive models in the following sections, we separate the
# target column from the other features and split the data into a training and
# a testing set.
from sklearn.model_selection import train_test_split

target_name = "hourly wage"
X, y = df.drop(columns=target_name), df[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# %%
# Income prediction with fully observed variables
# -----------------------------------------------
#
# First, we train a predictive model, a
# :class:`~sklearn.linear_model.LinearRegression` model. In this experiment,
# we assume that all variables used by the true generative model are available.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

features_names = ["experience", "parent hourly wage", "college degree", "ability"]

regressor_with_ability = LinearRegression()
regressor_with_ability.fit(X_train[features_names], y_train)
y_pred_with_ability = regressor_with_ability.predict(X_test[features_names])
R2_with_ability = r2_score(y_test, y_pred_with_ability)

print(f"R2 score with ability: {R2_with_ability:.3f}")

# %%
# This model predicts hourly wages well, as shown by the high R2 score. We
# plot the model coefficients to show that we recover the values of the true
# generative model almost exactly.
import matplotlib.pyplot as plt

model_coef = pd.Series(regressor_with_ability.coef_, index=features_names)
coef = pd.concat(
    [true_coef[features_names], model_coef],
    keys=["Coefficients of true generative model", "Model coefficients"],
    axis=1,
)
ax = coef.plot.barh()
ax.set_xlabel("Coefficient values")
ax.set_title("Coefficients of the linear regression including the ability feature")
plt.tight_layout()
plt.show()

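# %%
# As an additional numeric check (not part of the original example), we can
# print the largest absolute gap between the fitted and the true coefficients;
# it is small, up to the noise and the clipping of negative wages at zero.
print((model_coef - true_coef[features_names]).abs().max())
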
# %%
# Income prediction with partial observations
# -------------------------------------------
#
# In practice, intellectual abilities are not observed or are only estimated
# from proxies that inadvertently measure education as well (e.g. by IQ tests).
# Omitting the "ability" feature from a linear model then inflates the
# estimated effect of the college degree through a positive OVB.
features_names = ["experience", "parent hourly wage", "college degree"]

regressor_without_ability = LinearRegression()
regressor_without_ability.fit(X_train[features_names], y_train)
y_pred_without_ability = regressor_without_ability.predict(X_test[features_names])
R2_without_ability = r2_score(y_test, y_pred_without_ability)

print(f"R2 score without ability: {R2_without_ability:.3f}")

# %%
# In terms of R2 score, the predictive power of our model is similar when we
# omit the ability feature. We now check whether the coefficients of the model
# differ from those of the true generative model.

model_coef = pd.Series(regressor_without_ability.coef_, index=features_names)
coef = pd.concat(
    [true_coef[features_names], model_coef],
    keys=["Coefficients of true generative model", "Model coefficients"],
    axis=1,
)
ax = coef.plot.barh()
ax.set_xlabel("Coefficient values")
_ = ax.set_title("Coefficients of the linear regression excluding the ability feature")

# %%
# To compensate for the omitted variable, the model inflates the coefficient of
# the college degree feature. Therefore, interpreting this coefficient value
# as a causal effect of the true generative model is incorrect.
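#
# As a rough sanity check (an addition, not part of the original example), we
# can verify the classical OVB formula: each biased coefficient equals the
# true coefficient plus the true "ability" coefficient multiplied by the
# corresponding coefficient of an auxiliary regression of ability on the
# included features. Because the simulated wages were clipped at zero, the
# match is only approximate.

# Auxiliary regression of the omitted variable on the included features.
auxiliary_regressor = LinearRegression()
auxiliary_regressor.fit(X_train[features_names], X_train["ability"])

# Biased coefficients predicted by the omitted-variable bias formula.
predicted_biased_coef = (
    true_coef[features_names] + true_coef["ability"] * auxiliary_regressor.coef_
)
print(predicted_biased_coef)

# %%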
#
# Lessons learned
# ---------------
#
# Machine learning models are not designed for the estimation of causal
# effects. While we showed this with a linear model, OVB can affect any type of
# model.
#
# Whenever interpreting a coefficient or a change in predictions brought about
# by a change in one of the features, it is important to keep in mind
# potentially unobserved variables that could be correlated with both the
# feature in question and the target variable. Such variables are called
# `Confounding Variables <https://en.wikipedia.org/wiki/Confounding>`_. In
# order to still estimate causal effects in the presence of confounding,
# researchers usually conduct experiments in which the treatment variable (e.g.
# college degree) is randomized. When an experiment is prohibitively expensive
# or unethical, researchers can sometimes use other causal inference techniques
# such as `Instrumental Variables
# <https://en.wikipedia.org/wiki/Instrumental_variables_estimation>`_ (IV)
# estimations.