因果图(Causal Graph)#

import networkx as nx
# import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
np.random.seed(1000)

Collider (对撞结构)#

  • Suppose z is collider, x influence z, and y influence z, no relationship between x and y

draw_options = {
    "arrowsize": 20,
    "font_size": 20,
    "node_size": 1000,
    "node_color": "white",
    "edgecolors": "black",
    "linewidths": 1,
    "width": 1,
    "with_labels": True,
}

# 构建有向图并添加结点和边
Collider = nx.DiGraph()
Collider.add_nodes_from(list('xyz'))
Collider.add_edges_from([('x','z'),('y','z')])

pos = {'x': (-0.3,0), 'z': (0,0.2), 'y': (0.3,0)} 

nx.draw(Collider, **draw_options,pos =pos)
../../_images/df1a59be698d4c5f27e18826fa9d2d50884634b727aaf1602fe790793f41496b.png
# generate x, y
x = 10+50*np.random.randn(100000)
y = -20 + 30*np.random.rand(100000)

print(np.corrcoef(x,y)[0,1])

# generate z
z = x + y + 1.5*np.random.randn(100000)
0.00035713020604406597
  • fit model

    • (1) \(y_i = \alpha + \beta x_i +\epsilon_i\)

    • (2) \(y_i = \alpha + \beta x_i + \gamma z_i + \epsilon_i\)

cons = np.array([1]*100000)
fit = sm.OLS(endog = y, exog = np.array([cons,x]).T).fit()
# print(fit.summary())

fit2 = sm.OLS(endog = y, exog = np.array([cons,x,z]).T).fit()
#print(fit2.summary())

# Presenting result
result_table = summary_col(results = [fit, fit2],
                          float_format = '%.4f',
                          model_names = ['Model 1', 'Model 2'],
                          stars = True)
result_table.add_title('Model 1 v.s. Model 2')
print(result_table)
        Model 1 v.s. Model 2
====================================
                Model 1    Model 2  
------------------------------------
R-squared      0.0000     0.9709    
R-squared Adj. -0.0000    0.9709    
const          -4.9667*** -0.1411***
               (0.0279)   (0.0054)  
x1             0.0001     -0.9700***
               (0.0005)   (0.0005)  
x2                        0.9702*** 
                          (0.0005)  
====================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

Fork#

  • 混杂变量,confounder

# generate z,x, y
z = 200 + np.random.randn(100000)
x = 10+50*z + np.random.randn(100000)
y = -20 + 30*z + 3*np.random.rand(100000)

print(np.corrcoef(x,y)[0,1])

# # generate z
# z = x + y + 1.5*np.random.randn(100000)

cons = np.array([1]*100000)
fit = sm.OLS(endog = y, exog = np.array([cons,x]).T).fit()
# print(fit.summary())

fit2 = sm.OLS(endog = y, exog = np.array([cons,x,z]).T).fit()
#print(fit2.summary())

# Presenting result
result_table = summary_col(results = [fit, fit2],
                          float_format = '%.4f',
                          model_names = ['Model 1', 'Model 2'],
                          stars = True)
result_table.add_title('Model 1 v.s. Model 2')
print(result_table)
0.9993873947603757
         Model 1 v.s. Model 2
======================================
                 Model 1     Model 2  
--------------------------------------
R-squared      0.9988      0.9992     
R-squared Adj. 0.9988      0.9992     
const          -20.1788*** -17.1419***
               (0.6646)    (0.5472)   
x1             0.5996***   0.0016     
               (0.0001)    (0.0027)   
x2                         29.9135*** 
                           (0.1371)   
======================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
draw_options = {
    "arrowsize": 20,
    "font_size": 20,
    "node_size": 1000,
    "node_color": "white",
    "edgecolors": "black",
    "linewidths": 1,
    "width": 1,
    "with_labels": True,
}

# 构建有向图并添加结点和边
Collider = nx.DiGraph()
Collider.add_nodes_from(list('xyz'))
Collider.add_edges_from([('z','x'),('z','y')])

pos = {'x': (-0.3,0), 'z': (0,0.2), 'y': (0.3,0)} 

nx.draw(Collider, **draw_options,pos =pos)
../../_images/2d5407c05e509d6368444c727316819c9b395b2bff59ed183c9716808ea9387d.png
draw_options = {
    "arrowsize": 20,
    "font_size": 20,
    "node_size": 1000,
    "node_color": "white",
    "edgecolors": "black",
    "linewidths": 1,
    "width": 1,
    "with_labels": True,
}

# 构建有向图并添加结点和边
G = nx.DiGraph()
G.add_nodes_from(list('ABCPQ'))
G.add_edges_from([('P','Q'),('A','P'),('A','Q')])
G.add_edges_from([('P','C'),('C','Q')])
G.add_edges_from([('B','P')])

pos = {'B': (-0.6,0), 'P': (0,0), 'Q': (0.7,0),
        'A':(0.35, -0.1), 'C':(0.35,0.1)}

nx.draw(G, **draw_options,pos =pos)
../../_images/d6c923840c99decc36d73cde9c5747166b4c239292f64278e4126289cef87302.png
data_size = 100000
b = 100 + 10*np.random.randn(data_size)
a = 60 + 30*np.random.randn(data_size)
p = 10 + 2*b + 0.8*a + np.random.randn(data_size)
c = 2 + p + np.random.randn(data_size)
q = 400 - p + 0.2*c + 2*a + np.random.randn(data_size)
# total price effect
# q = 400.4 - 0.8*p + 2*a + noise
  • fit model

    • (1) \(q_i = \alpha + \beta p_i +\epsilon_i\)

    • (2) \(q_i = \alpha + \beta p_i + \gamma a_i + \epsilon_i\)

cons = np.array([1]*data_size)
fit = sm.OLS(endog = q, exog = np.array([cons,p]).T).fit()
# print(fit.summary())

fit2 = sm.OLS(endog = q, exog = np.array([cons,p, a]).T).fit()
#print(fit2.summary())

# Presenting result
result_table = summary_col(results = [fit, fit2],
                          float_format = '%.4f',
                          model_names = ['Model 1', 'Model 2'],
                          stars = True)
result_table.add_title('Model 1 v.s. Model 2')
print(result_table)
         Model 1 v.s. Model 2
======================================
                 Model 1     Model 2  
--------------------------------------
R-squared      0.2352      0.9995     
R-squared Adj. 0.2352      0.9995     
const          138.1122*** 400.4521***
               (1.0109)    (0.0345)   
x1             0.6819***   -0.8002*** 
               (0.0039)    (0.0002)   
x2                         2.0001***  
                           (0.0002)   
======================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
# OLS
# ols_fit = sm.OLS(endog = q, exog = np.array([cons,p]).T).fit()
# 2SLS
iv_fit_1stage = sm.OLS(endog = p, exog = np.array([cons,b]).T).fit()
                       
x_hat = iv_fit_1stage.predict()
iv_fit_2stage = sm.OLS(endog = q, exog = np.array([cons,x_hat]).T).fit()

# control confounder 
# ols_confounder = sm.OLS(data['y'], data[['x','z']]).fit()

# presenting result
# from statsmodels.iolib.summary2 import summary_col

result_table = summary_col(results = [fit, fit2, iv_fit_2stage],
                          float_format = '%.4f',
                          model_names = ['Model1', 'Model 2', 'Model 1 with IV'],
                          stars = True)
result_table.add_title('IV')
print(result_table)
# print(ols_fit.summary())
# print(iv_fit_2stage.summary())
                          IV
======================================================
                  Model1     Model 2   Model 1 with IV
------------------------------------------------------
R-squared      0.2352      0.9995      0.1324         
R-squared Adj. 0.2352      0.9995      0.1324         
const          138.1122*** 400.4521*** 521.4658***    
               (1.0109)    (0.0345)    (1.6836)       
x1             0.6819***   -0.8002***  -0.8037***     
               (0.0039)    (0.0002)    (0.0065)       
x2                         2.0001***                  
                           (0.0002)                   
======================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

Chain#

draw_options = {
    "arrowsize": 20,
    "font_size": 20,
    "node_size": 1000,
    "node_color": "white",
    "edgecolors": "black",
    "linewidths": 1,
    "width": 1,
    "with_labels": True,
}

# 构建有向图并添加结点和边
chain = nx.DiGraph()
chain.add_nodes_from(list('xyz'))
chain.add_edges_from([('x','z'),('z','y')])

pos = {'x': (-0.3,0), 'z': (0,0), 'y': (0.3,0)} 
nx.draw(chain, **draw_options,pos =pos)
../../_images/d0a118209c1e4057cd912954ff27b17d15f981782eb09a06f90eb51ea101941e.png
data_size = 100000
x = 100 + 10*np.random.randn(data_size)
z = 3*x+  np.random.randn(data_size)
y = 5*z+ np.random.randn(data_size)
  • fit model

    • (1) \(y_i = \alpha + \beta x_i +\epsilon_i\)

    • (2) \(y_i = \alpha + \beta x_i + \gamma z_i + \epsilon_i\)

cons = np.array([1]*data_size)
fit = sm.OLS(endog = y, exog = np.array([cons,x]).T).fit()
# print(fit.summary())

fit2 = sm.OLS(endog = y, exog = np.array([cons,x, z]).T).fit()
#print(fit2.summary())

# Presenting result
result_table = summary_col(results = [fit, fit2],
                          float_format = '%.4f',
                          model_names = ['Model 1', 'Model 2'],
                          stars = True)
result_table.add_title('Model 1 v.s. Model 2')
print(result_table)
       Model 1 v.s. Model 2
===================================
                Model 1    Model 2 
-----------------------------------
R-squared      0.9988     1.0000   
R-squared Adj. 0.9988     1.0000   
const          -0.4012**  -0.0434  
               (0.1620)   (0.0316) 
x1             15.0039*** -0.0024  
               (0.0016)   (0.0094) 
x2                        5.0010***
                          (0.0031) 
===================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

Simulation Practice: Quantifying causality in data science with quasi-experiments#

IV simu#

causal graph:

np.random.seed(2021)
z = np.random.randn()

1.1 Simu data#

import random
random.seed(2021)  #设定种子
# generate IV , X , Z and Y
iv = np.array([random.gauss(10,10) for i in range(5000)])
z = np.array([random.randrange(-30,30) for i in range(5000)])
x = 3*iv + 2*z+ np.array([random.gauss(0,1) for i in range(5000)])
y = 4*x + 6*z + np.array([random.gauss(0,1) for i in range(5000)])

print('IV first 20 data: ')
print(iv[0:20])
print('-'*20)

print('y first 20 data: ')
print(y[0:20])
print('-'*20)
IV first 20 data: 
[ 20.20586392  -6.93081101   2.74935374   7.9373736   -7.82976154
 -10.22917018   6.41968456  10.59498401   8.58104578  21.26374151
  15.07168142   6.92463389  17.90393551  -4.47388035   4.1944725
  -1.18214967  27.76604789  15.89608828   3.46856887  11.36222159]
--------------------
y first 20 data: 
[-186.54764132   89.24934793 -115.40383367  -17.07299347 -192.41523531
 -208.01767037 -220.04205714  259.29547667  398.37124418  213.11254403
  192.87980195  486.78183573   21.63838191  261.54467099 -160.23441493
   50.70465479  292.82361655  485.63725094  195.58686214   34.25125174]
--------------------
import pandas as pd 
df = pd.DataFrame([y,x,z,iv])
data = pd.DataFrame(df.values.T,columns = ['y','x','z','iv'])

1.2 Regression#

import statsmodels.api as sm
# OLS
ols_fit = sm.OLS(endog = y, exog=x).fit()
# 2SLS
iv_fit_1stage = sm.OLS(endog = x, exog = iv).fit()
x_hat = iv_fit_1stage.predict()
iv_fit_2stage = sm.OLS(endog = y, exog = x_hat).fit()

# control confounder 
ols_confounder = sm.OLS(data['y'], data[['x','z']]).fit()

# presenting result
from statsmodels.iolib.summary2 import summary_col

result_table = summary_col(results = [ols_fit, iv_fit_2stage, ols_confounder],
                          float_format = '%.4f',
                          model_names = ['OLS', '2SLS','control confounder'],
                          stars = True)
result_table.add_title('OLS v.s. 2SLS')
print(result_table)
# print(ols_fit.summary())
# print(iv_fit_2stage.summary())
                    OLS v.s. 2SLS
=====================================================
                  OLS       2SLS   control confounder
-----------------------------------------------------
R-squared      0.9231    0.3286    1.0000            
R-squared Adj. 0.9231    0.3285    1.0000            
x                                  4.0001***         
                                   (0.0003)          
x1             5.1567*** 3.9553***                   
               (0.0211)  (0.0800)                    
z                                  6.0000***         
                                   (0.0010)          
=====================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

1.3 simulating cofficient for 200 times#

def reg_result():
    # generate IV , X , Z and Y
    iv = np.array([random.gauss(10,10) for i in range(5000)])
    z = np.array([random.randrange(-30,30) for i in range(5000)])
    x = 3*iv + 2*z+ np.array([random.gauss(0,1) for i in range(5000)])
    y = 4*x + 5*z + np.array([random.gauss(0,1) for i in range(5000)])
    
    # regression

    ols_fit = sm.OLS(endog = y, exog=x).fit()
    # 2SLS
    iv_fit_1stage = sm.OLS(endog = x, exog = iv).fit()
    x_hat = iv_fit_1stage.predict()
    iv_fit_2stage = sm.OLS(endog = y, exog = x_hat).fit()
    return ols_fit.params[0], iv_fit_2stage.params[0]


ols = []
tsls = []
for i in range(500):
    a,b=reg_result()
    ols.append(a)
    tsls.append(b)
from matplotlib import pyplot as plt
true_effect  = 4
x = [true_effect]*120
y = range(0,120)
plt.figure(figsize=(20,8),dpi=80)
plt.plot(x,y, '--', color='black',linewidth = 3)

plt.hist(ols)
plt.hist(tsls)
plt.show()
../../_images/2078e9ec49b7ad016bc29410459163f529e8c1bbf78c12e692f611c10fc0be29.png
# 如果IV 影響y呢? relax Exclusion Restriction

def reg_result():
    # generate IV , X , Z and Y
    iv = np.array([random.gauss(10,10) for i in range(5000)])
    z = np.array([random.randrange(-30,30) for i in range(5000)])
    x = 3*iv + 2*z+ np.array([random.gauss(0,1) for i in range(5000)])
    y = 4*x + 5*z + 4*iv+ np.array([random.gauss(0,1) for i in range(5000)])
    
    # regression

    ols_fit = sm.OLS(endog = y, exog=x).fit()
    # 2SLS
    iv_fit_1stage = sm.OLS(endog = x, exog = iv).fit()
    x_hat = iv_fit_1stage.predict()
    iv_fit_2stage = sm.OLS(endog = y, exog = x_hat).fit()
    return ols_fit.params[0], iv_fit_2stage.params[0]


ols = []
tsls = []
for i in range(500):
    a,b=reg_result()
    ols.append(a)
    tsls.append(b)
from matplotlib import pyplot as plt
true_effect  = 4
x = [true_effect]*120
y = range(0,120)
plt.figure(figsize=(20,8),dpi=80)
plt.plot(x,y, '--', color='black',linewidth = 3)

plt.hist(ols)
plt.hist(tsls)
plt.show()
../../_images/0214fef5662b12da5281a2383d9a4950e037271fc2dee4c39a5d4c9f50475d8b.png