因果图(Causal Graph)#
import networkx as nx
# import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
# Fix the NumPy RNG seed so every simulated dataset (and printed result) below
# is reproducible.
np.random.seed(1000)
Collider (对撞结构)#
Suppose z is a collider: x influences z and y influences z, and there is no direct relationship between x and y.
# Shared styling for nx.draw: large white nodes with black outlines and labels.
draw_options = {
    "arrowsize": 20,
    "font_size": 20,
    "node_size": 1000,
    "node_color": "white",
    "edgecolors": "black",
    "linewidths": 1,
    "width": 1,
    "with_labels": True,
}
# Build the directed graph and add its nodes and edges:
# collider structure x -> z <- y (z is a common effect of x and y).
Collider = nx.DiGraph()
Collider.add_nodes_from(list('xyz'))
Collider.add_edges_from([('x','z'),('y','z')])
# Fixed layout: x and y side by side, the collider z above them.
pos = {'x': (-0.3,0), 'z': (0,0.2), 'y': (0.3,0)}
nx.draw(Collider, **draw_options,pos =pos)
# Simulate two independent parents of the collider.
n_obs = 100000
x = 50 * np.random.randn(n_obs) + 10   # Gaussian: mean 10, sd 50
y = 30 * np.random.rand(n_obs) - 20    # Uniform on [-20, 10)
# Independence check: the sample correlation should be ~0.
print(np.corrcoef(x, y)[0, 1])
# The collider z is caused by both x and y (plus small Gaussian noise).
z = x + y + 1.5 * np.random.randn(n_obs)
0.00035713020604406597
Fit the following two models:
(1) \(y_i = \alpha + \beta x_i +\epsilon_i\)
(2) \(y_i = \alpha + \beta x_i + \gamma z_i + \epsilon_i\)
# Model 1 regresses y on x alone; Model 2 additionally conditions on the
# collider z (which induces a spurious x-y association).
cons = np.ones(100000, dtype=int)
fit = sm.OLS(endog=y, exog=np.column_stack((cons, x))).fit()
fit2 = sm.OLS(endog=y, exog=np.column_stack((cons, x, z))).fit()
# Side-by-side comparison table with significance stars.
result_table = summary_col(
    results=[fit, fit2],
    float_format='%.4f',
    model_names=['Model 1', 'Model 2'],
    stars=True,
)
result_table.add_title('Model 1 v.s. Model 2')
print(result_table)
Model 1 v.s. Model 2
====================================
Model 1 Model 2
------------------------------------
R-squared 0.0000 0.9709
R-squared Adj. -0.0000 0.9709
const -4.9667*** -0.1411***
(0.0279) (0.0054)
x1 0.0001 -0.9700***
(0.0005) (0.0005)
x2 0.9702***
(0.0005)
====================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
Fork#
A confounder (混杂变量) is a common cause of both x and y.
# Fork / confounder: z is a common cause of both x and y.
z = 200 + np.random.randn(100000)
noise_x = np.random.randn(100000)
x = 10 + 50*z + noise_x
noise_y = 3 * np.random.rand(100000)
y = -20 + 30*z + noise_y
# x and y are strongly correlated even though neither causes the other.
print(np.corrcoef(x, y)[0, 1])
# Model 1 regresses y on x alone (confounded); Model 2 additionally controls
# for the confounder z, which removes the spurious x effect.
cons = np.ones(100000, dtype=int)
fit = sm.OLS(endog=y, exog=np.column_stack((cons, x))).fit()
fit2 = sm.OLS(endog=y, exog=np.column_stack((cons, x, z))).fit()
# Side-by-side comparison table with significance stars.
result_table = summary_col(
    results=[fit, fit2],
    float_format='%.4f',
    model_names=['Model 1', 'Model 2'],
    stars=True,
)
result_table.add_title('Model 1 v.s. Model 2')
print(result_table)
0.9993873947603757
Model 1 v.s. Model 2
======================================
Model 1 Model 2
--------------------------------------
R-squared 0.9988 0.9992
R-squared Adj. 0.9988 0.9992
const -20.1788*** -17.1419***
(0.6646) (0.5472)
x1 0.5996*** 0.0016
(0.0001) (0.0027)
x2 29.9135***
(0.1371)
======================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
# Shared styling for nx.draw: large white nodes with black outlines and labels.
draw_options = {
    "arrowsize": 20,
    "font_size": 20,
    "node_size": 1000,
    "node_color": "white",
    "edgecolors": "black",
    "linewidths": 1,
    "width": 1,
    "with_labels": True,
}
# Build the directed graph and add its nodes and edges.
# NOTE: the original code reused the name `Collider` here, but the edges
# (z -> x, z -> y) form a fork, so the graph is named `Fork` to avoid
# confusion; the name is not referenced anywhere else.
Fork = nx.DiGraph()
Fork.add_nodes_from(list('xyz'))
Fork.add_edges_from([('z', 'x'), ('z', 'y')])
# Fixed layout: x and y side by side, the common cause z above them.
pos = {'x': (-0.3, 0), 'z': (0, 0.2), 'y': (0.3, 0)}
nx.draw(Fork, **draw_options, pos=pos)
# Shared styling for nx.draw: large white nodes with black outlines and labels.
draw_options = {
    "arrowsize": 20,
    "font_size": 20,
    "node_size": 1000,
    "node_color": "white",
    "edgecolors": "black",
    "linewidths": 1,
    "width": 1,
    "with_labels": True,
}
# Build the directed graph and add its nodes and edges.
# Per the simulation below: A is a confounder of P and Q, C mediates the
# P -> Q path, and B affects P only (so B can serve as an instrument for P).
G = nx.DiGraph()
G.add_nodes_from(list('ABCPQ'))
G.add_edges_from([('P','Q'),('A','P'),('A','Q')])
G.add_edges_from([('P','C'),('C','Q')])
G.add_edges_from([('B','P')])
# Layout: B left of P, Q to the right; confounder A below, mediator C above.
pos = {'B': (-0.6,0), 'P': (0,0), 'Q': (0.7,0),
       'A':(0.35, -0.1), 'C':(0.35,0.1)}
nx.draw(G, **draw_options,pos =pos)
# Simulate the demand system: instrument b, confounder a, price p,
# mediator c, and quantity q.
data_size = 100000
b = 100 + 10*np.random.randn(data_size)   # instrument: shifts p only
a = 60 + 30*np.random.randn(data_size)    # confounder: shifts both p and q
noise_p = np.random.randn(data_size)
noise_c = np.random.randn(data_size)
noise_q = np.random.randn(data_size)
p = 10 + 2*b + 0.8*a + noise_p
c = 2 + p + noise_c                        # mediator on the p -> q path
q = 400 - p + 0.2*c + 2*a + noise_q
# Substituting c into q gives the total price effect:
# q = 400.4 - 0.8*p + 2*a + noise
Fit the following two models:
(1) \(q_i = \alpha + \beta p_i +\epsilon_i\)
(2) \(q_i = \alpha + \beta p_i + \gamma a_i + \epsilon_i\)
# Model 1: q on p only (biased by the confounder a).
# Model 2: q on p while controlling for the confounder a.
cons = np.ones(data_size, dtype=int)
fit = sm.OLS(endog=q, exog=np.column_stack((cons, p))).fit()
fit2 = sm.OLS(endog=q, exog=np.column_stack((cons, p, a))).fit()
# Side-by-side comparison table with significance stars.
result_table = summary_col(
    results=[fit, fit2],
    float_format='%.4f',
    model_names=['Model 1', 'Model 2'],
    stars=True,
)
result_table.add_title('Model 1 v.s. Model 2')
print(result_table)
Model 1 v.s. Model 2
======================================
Model 1 Model 2
--------------------------------------
R-squared 0.2352 0.9995
R-squared Adj. 0.2352 0.9995
const 138.1122*** 400.4521***
(1.0109) (0.0345)
x1 0.6819*** -0.8002***
(0.0039) (0.0002)
x2 2.0001***
(0.0002)
======================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
# Two-stage least squares, using b as an instrument for the price p.
# Stage 1: regress p on the instrument b.
iv_fit_1stage = sm.OLS(endog=p, exog=np.column_stack((cons, b))).fit()
# Stage 2: regress q on the fitted (exogenous) part of p.
x_hat = iv_fit_1stage.predict()
iv_fit_2stage = sm.OLS(endog=q, exog=np.column_stack((cons, x_hat))).fit()
# Compare the two OLS fits against the IV estimate.
result_table = summary_col(
    results=[fit, fit2, iv_fit_2stage],
    float_format='%.4f',
    model_names=['Model1', 'Model 2', 'Model 1 with IV'],
    stars=True,
)
result_table.add_title('IV')
print(result_table)
IV
======================================================
Model1 Model 2 Model 1 with IV
------------------------------------------------------
R-squared 0.2352 0.9995 0.1324
R-squared Adj. 0.2352 0.9995 0.1324
const 138.1122*** 400.4521*** 521.4658***
(1.0109) (0.0345) (1.6836)
x1 0.6819*** -0.8002*** -0.8037***
(0.0039) (0.0002) (0.0065)
x2 2.0001***
(0.0002)
======================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
Chain#
# Shared styling for nx.draw: large white nodes with black outlines and labels.
draw_options = {
    "arrowsize": 20,
    "font_size": 20,
    "node_size": 1000,
    "node_color": "white",
    "edgecolors": "black",
    "linewidths": 1,
    "width": 1,
    "with_labels": True,
}
# Build the directed graph and add its nodes and edges:
# chain structure x -> z -> y (z mediates the effect of x on y).
chain = nx.DiGraph()
chain.add_nodes_from(list('xyz'))
chain.add_edges_from([('x','z'),('z','y')])
# Fixed layout: the three nodes on one horizontal line.
pos = {'x': (-0.3,0), 'z': (0,0), 'y': (0.3,0)}
nx.draw(chain, **draw_options,pos =pos)
# Chain x -> z -> y: z fully mediates the effect of x on y.
data_size = 100000
x = 100 + 10*np.random.randn(data_size)
noise_z = np.random.randn(data_size)
noise_y = np.random.randn(data_size)
z = 3*x + noise_z
y = 5*z + noise_y
Fit the following two models:
(1) \(y_i = \alpha + \beta x_i +\epsilon_i\)
(2) \(y_i = \alpha + \beta x_i + \gamma z_i + \epsilon_i\)
# Model 1: y on x alone (captures the total effect through z).
# Model 2: y on x and the mediator z (blocks the chain, so x loses its effect).
cons = np.ones(data_size, dtype=int)
fit = sm.OLS(endog=y, exog=np.column_stack((cons, x))).fit()
fit2 = sm.OLS(endog=y, exog=np.column_stack((cons, x, z))).fit()
# Side-by-side comparison table with significance stars.
result_table = summary_col(
    results=[fit, fit2],
    float_format='%.4f',
    model_names=['Model 1', 'Model 2'],
    stars=True,
)
result_table.add_title('Model 1 v.s. Model 2')
print(result_table)
Model 1 v.s. Model 2
===================================
Model 1 Model 2
-----------------------------------
R-squared 0.9988 1.0000
R-squared Adj. 0.9988 1.0000
const -0.4012** -0.0434
(0.1620) (0.0316)
x1 15.0039*** -0.0024
(0.0016) (0.0094)
x2 5.0010***
(0.0031)
===================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
Simulation Practice: Quantifying causality in data science with quasi-experiments#
IV simu#
Causal graph: iv → x → y, with a confounder z that affects both x and y.
# Reseed the NumPy RNG for the quasi-experiment section.
np.random.seed(2021)
# NOTE(review): this scalar draw looks like a leftover cell — z is reassigned
# by the simulation below before this value is ever used; verify intent.
z = np.random.randn()
1.1 Simu data#
import random

random.seed(2021)  # fix the stdlib RNG seed for reproducibility
# Simulate the instrument (iv), confounder (z), treatment (x) and outcome (y).
iv = np.array([random.gauss(10, 10) for _ in range(5000)])
z = np.array([random.randrange(-30, 30) for _ in range(5000)])
noise_x = np.array([random.gauss(0, 1) for _ in range(5000)])
noise_y = np.array([random.gauss(0, 1) for _ in range(5000)])
x = 3*iv + 2*z + noise_x
y = 4*x + 6*z + noise_y
# Peek at the simulated draws.
print('IV first 20 data: ')
print(iv[0:20])
print('-'*20)
print('y first 20 data: ')
print(y[0:20])
print('-'*20)
IV first 20 data:
[ 20.20586392 -6.93081101 2.74935374 7.9373736 -7.82976154
-10.22917018 6.41968456 10.59498401 8.58104578 21.26374151
15.07168142 6.92463389 17.90393551 -4.47388035 4.1944725
-1.18214967 27.76604789 15.89608828 3.46856887 11.36222159]
--------------------
y first 20 data:
[-186.54764132 89.24934793 -115.40383367 -17.07299347 -192.41523531
-208.01767037 -220.04205714 259.29547667 398.37124418 213.11254403
192.87980195 486.78183573 21.63838191 261.54467099 -160.23441493
50.70465479 292.82361655 485.63725094 195.58686214 34.25125174]
--------------------
import pandas as pd

# Assemble the simulated series into one tidy DataFrame with named columns.
# The original built a 4 x 5000 frame and then transposed its `.values`;
# constructing the columns directly is equivalent (same column order, values,
# and RangeIndex) and avoids the intermediate copy.
data = pd.DataFrame({'y': y, 'x': x, 'z': z, 'iv': iv})
1.2 Regression#
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

# Naive OLS of y on x (no constant): biased because z drives both x and y.
ols_fit = sm.OLS(endog=y, exog=x).fit()
# 2SLS: stage 1 projects x onto the instrument; stage 2 regresses y on the
# fitted values, which carry only the exogenous variation in x.
iv_fit_1stage = sm.OLS(endog=x, exog=iv).fit()
x_hat = iv_fit_1stage.predict()
iv_fit_2stage = sm.OLS(endog=y, exog=x_hat).fit()
# Benchmark: directly controlling for the confounder z.
ols_confounder = sm.OLS(data['y'], data[['x', 'z']]).fit()
# Present all three estimates side by side.
result_table = summary_col(
    results=[ols_fit, iv_fit_2stage, ols_confounder],
    float_format='%.4f',
    model_names=['OLS', '2SLS', 'control confounder'],
    stars=True,
)
result_table.add_title('OLS v.s. 2SLS')
print(result_table)
OLS v.s. 2SLS
=====================================================
OLS 2SLS control confounder
-----------------------------------------------------
R-squared 0.9231 0.3286 1.0000
R-squared Adj. 0.9231 0.3285 1.0000
x 4.0001***
(0.0003)
x1 5.1567*** 3.9553***
(0.0211) (0.0800)
z 6.0000***
(0.0010)
=====================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
1.3 Simulating the coefficient estimates 500 times#
def reg_result():
    """Simulate one dataset and return (naive OLS slope, 2SLS slope) for x.

    Uses the module-level ``random``, ``np`` and ``sm`` names. In this data
    generating process the true effect of x on y is 4.
    """
    n = 5000
    # Draw instrument, confounder, treatment and outcome.
    iv = np.array([random.gauss(10, 10) for _ in range(n)])
    z = np.array([random.randrange(-30, 30) for _ in range(n)])
    x = 3*iv + 2*z + np.array([random.gauss(0, 1) for _ in range(n)])
    y = 4*x + 5*z + np.array([random.gauss(0, 1) for _ in range(n)])
    # Naive OLS slope of y on x (no constant).
    ols_fit = sm.OLS(endog=y, exog=x).fit()
    # 2SLS: project x on the instrument, then regress y on that projection.
    first_stage = sm.OLS(endog=x, exog=iv).fit()
    x_hat = first_stage.predict()
    second_stage = sm.OLS(endog=y, exog=x_hat).fit()
    return ols_fit.params[0], second_stage.params[0]
# Repeat the simulation 500 times, collecting both slope estimates.
ols = []
tsls = []
for _ in range(500):
    a, b = reg_result()
    ols.append(a)
    tsls.append(b)

from matplotlib import pyplot as plt

# Overlay histograms of the two estimators, with a dashed vertical line at
# the true effect (4) for reference.
true_effect = 4
x = [true_effect] * 120
y = range(0, 120)
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(x, y, '--', color='black', linewidth=3)
plt.hist(ols)
plt.hist(tsls)
plt.show()
# What if the instrument also directly affects y? (relaxing the Exclusion
# Restriction)
def reg_result():
    # Redefines reg_result: same setup as above, except y now also depends
    # directly on iv, so iv is no longer a valid instrument.
    # generate IV , X , Z and Y
    iv = np.array([random.gauss(10,10) for i in range(5000)])
    z = np.array([random.randrange(-30,30) for i in range(5000)])
    x = 3*iv + 2*z+ np.array([random.gauss(0,1) for i in range(5000)])
    # The direct 4*iv term is what violates the exclusion restriction.
    y = 4*x + 5*z + 4*iv+ np.array([random.gauss(0,1) for i in range(5000)])
    # regression
    ols_fit = sm.OLS(endog = y, exog=x).fit()
    # 2SLS
    iv_fit_1stage = sm.OLS(endog = x, exog = iv).fit()
    x_hat = iv_fit_1stage.predict()
    iv_fit_2stage = sm.OLS(endog = y, exog = x_hat).fit()
    # Return the naive OLS slope and the 2SLS slope (true effect of x is 4).
    return ols_fit.params[0], iv_fit_2stage.params[0]
# Re-run the 500 simulations with the invalid instrument.
ols = []
tsls = []
for _ in range(500):
    a, b = reg_result()
    ols.append(a)
    tsls.append(b)

from matplotlib import pyplot as plt

# Dashed line marks the true effect (4); with the exclusion restriction
# violated, the estimates are expected to drift away from it.
true_effect = 4
x = [true_effect] * 120
y = range(0, 120)
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(x, y, '--', color='black', linewidth=3)
plt.hist(ols)
plt.hist(tsls)
plt.show()