%pip install numpy pandas patsy statsmodels wooldridge -q
import numpy as np
import pandas as pd
import patsy as pt
import statsmodels.formula.api as smf
import statsmodels.stats.outliers_influence as smo
import wooldridge as wool
gpa1 = wool.data("gpa1")
reg = smf.ols(formula="colGPA ~ hsGPA + ACT", data=gpa1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary():
OLS Regression Results
==============================================================================
Dep. Variable: colGPA R-squared: 0.176
Model: OLS Adj. R-squared: 0.164
Method: Least Squares F-statistic: 14.78
Date: Mon, 16 Sep 2024 Prob (F-statistic): 1.53e-06
Time: 16:38:29 Log-Likelihood: -46.573
No. Observations: 141 AIC: 99.15
Df Residuals: 138 BIC: 108.0
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 1.2863 0.341 3.774 0.000 0.612 1.960
hsGPA 0.4535 0.096 4.733 0.000 0.264 0.643
ACT 0.0094 0.011 0.875 0.383 -0.012 0.031
==============================================================================
Omnibus: 3.056 Durbin-Watson: 1.885
Prob(Omnibus): 0.217 Jarque-Bera (JB): 2.469
Skew: 0.199 Prob(JB): 0.291
Kurtosis: 2.488 Cond. No. 298.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
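The full summary is rarely needed for later calculations; the fitted results object exposes its individual pieces directly. A minimal sketch using standard statsmodels attributes (the values are those shown in the summary above):
# extract single components of the results instead of the full summary:
print(f"results.params: \n{results.params}\n")  # coefficient estimates
print(f"results.bse: \n{results.bse}\n")  # standard errors
print(f"results.rsquared: {results.rsquared}\n")  # R-squared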
wage1 = wool.data("wage1")
reg = smf.ols(formula="np.log(wage) ~ educ + exper + tenure", data=wage1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary():
OLS Regression Results
==============================================================================
Dep. Variable: np.log(wage) R-squared: 0.316
Model: OLS Adj. R-squared: 0.312
Method: Least Squares F-statistic: 80.39
Date: Mon, 16 Sep 2024 Prob (F-statistic): 9.13e-43
Time: 16:38:29 Log-Likelihood: -313.55
No. Observations: 526 AIC: 635.1
Df Residuals: 522 BIC: 652.2
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.2844 0.104 2.729 0.007 0.080 0.489
educ 0.0920 0.007 12.555 0.000 0.078 0.106
exper 0.0041 0.002 2.391 0.017 0.001 0.008
tenure 0.0221 0.003 7.133 0.000 0.016 0.028
==============================================================================
Omnibus: 11.534 Durbin-Watson: 1.769
Prob(Omnibus): 0.003 Jarque-Bera (JB): 20.941
Skew: 0.021 Prob(JB): 2.84e-05
Kurtosis: 3.977 Cond. No. 135.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
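Since the dependent variable is in logs, the educ coefficient of about 0.092 is a semi-elasticity: one more year of education is associated with roughly a 9.2% higher wage, holding exper and tenure fixed. The exact percentage effect is 100 * (exp(b) - 1), which the following sketch computes from the fitted results above:
# approximate vs. exact percentage effect of one more year of education:
b_educ = results.params["educ"]
approx_effect = 100 * b_educ  # approximate % change in wage
exact_effect = 100 * (np.exp(b_educ) - 1)  # exact % change in wage
print(f"approx_effect: {approx_effect}\n")
print(f"exact_effect: {exact_effect}\n")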
k401k = wool.data("401k")
reg = smf.ols(formula="prate ~ mrate + age", data=k401k)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary():
OLS Regression Results
==============================================================================
Dep. Variable: prate R-squared: 0.092
Model: OLS Adj. R-squared: 0.091
Method: Least Squares F-statistic: 77.79
Date: Mon, 16 Sep 2024 Prob (F-statistic): 6.67e-33
Time: 16:38:29 Log-Likelihood: -6422.3
No. Observations: 1534 AIC: 1.285e+04
Df Residuals: 1531 BIC: 1.287e+04
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 80.1190 0.779 102.846 0.000 78.591 81.647
mrate 5.5213 0.526 10.499 0.000 4.490 6.553
age 0.2431 0.045 5.440 0.000 0.155 0.331
==============================================================================
Omnibus: 375.579 Durbin-Watson: 1.910
Prob(Omnibus): 0.000 Jarque-Bera (JB): 805.992
Skew: -1.387 Prob(JB): 9.57e-176
Kurtosis: 5.217 Cond. No. 32.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
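Models fitted with the formula interface accept new data for prediction. A sketch for a hypothetical plan (the mrate and age values are chosen purely for illustration):
# predicted participation rate for a hypothetical plan:
new_plan = pd.DataFrame({"mrate": [0.5], "age": [10]})
prate_hat = results.predict(new_plan)
print(f"prate_hat: \n{prate_hat}\n")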
crime1 = wool.data("crime1")
# model without avgsen:
reg = smf.ols(formula="narr86 ~ pcnv + ptime86 + qemp86", data=crime1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary():
OLS Regression Results
==============================================================================
Dep. Variable: narr86 R-squared: 0.041
Model: OLS Adj. R-squared: 0.040
Method: Least Squares F-statistic: 39.10
Date: Mon, 16 Sep 2024 Prob (F-statistic): 9.91e-25
Time: 16:38:29 Log-Likelihood: -3394.7
No. Observations: 2725 AIC: 6797.
Df Residuals: 2721 BIC: 6821.
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.7118 0.033 21.565 0.000 0.647 0.776
pcnv -0.1499 0.041 -3.669 0.000 -0.230 -0.070
ptime86 -0.0344 0.009 -4.007 0.000 -0.051 -0.018
qemp86 -0.1041 0.010 -10.023 0.000 -0.124 -0.084
==============================================================================
Omnibus: 2394.860 Durbin-Watson: 1.836
Prob(Omnibus): 0.000 Jarque-Bera (JB): 106169.153
Skew: 4.002 Prob(JB): 0.00
Kurtosis: 32.513 Cond. No. 8.27
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
crime1 = wool.data("crime1")
# model with avgsen:
reg = smf.ols(formula="narr86 ~ pcnv + avgsen + ptime86 + qemp86", data=crime1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary():
OLS Regression Results
==============================================================================
Dep. Variable: narr86 R-squared: 0.042
Model: OLS Adj. R-squared: 0.041
Method: Least Squares F-statistic: 29.96
Date: Mon, 16 Sep 2024 Prob (F-statistic): 2.01e-24
Time: 16:38:29 Log-Likelihood: -3393.5
No. Observations: 2725 AIC: 6797.
Df Residuals: 2720 BIC: 6826.
Df Model: 4
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.7068 0.033 21.319 0.000 0.642 0.772
pcnv -0.1508 0.041 -3.692 0.000 -0.231 -0.071
avgsen 0.0074 0.005 1.572 0.116 -0.002 0.017
ptime86 -0.0374 0.009 -4.252 0.000 -0.055 -0.020
qemp86 -0.1033 0.010 -9.940 0.000 -0.124 -0.083
==============================================================================
Omnibus: 2396.990 Durbin-Watson: 1.837
Prob(Omnibus): 0.000 Jarque-Bera (JB): 106841.658
Skew: 4.006 Prob(JB): 0.00
Kurtosis: 32.611 Cond. No. 10.2
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
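Whether avgsen adds explanatory power can be tested formally with an F test of the two nested models; compare_f_test does this for a pair of OLS fits. A sketch that refits both models under distinct names so the restricted results are not overwritten:
# F test for avgsen: compare restricted and unrestricted models:
results_r = smf.ols(formula="narr86 ~ pcnv + ptime86 + qemp86", data=crime1).fit()
results_ur = smf.ols(
    formula="narr86 ~ pcnv + avgsen + ptime86 + qemp86",
    data=crime1,
).fit()
fstat, pval, df_diff = results_ur.compare_f_test(results_r)
print(f"fstat: {fstat}\n")
print(f"pval: {pval}\n")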
wage1 = wool.data("wage1")
reg = smf.ols(formula="np.log(wage) ~ educ", data=wage1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary():
OLS Regression Results
==============================================================================
Dep. Variable: np.log(wage) R-squared: 0.186
Model: OLS Adj. R-squared: 0.184
Method: Least Squares F-statistic: 119.6
Date: Mon, 16 Sep 2024 Prob (F-statistic): 3.27e-25
Time: 16:38:29 Log-Likelihood: -359.38
No. Observations: 526 AIC: 722.8
Df Residuals: 524 BIC: 731.3
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.5838 0.097 5.998 0.000 0.393 0.775
educ 0.0827 0.008 10.935 0.000 0.068 0.098
==============================================================================
Omnibus: 11.804 Durbin-Watson: 1.801
Prob(Omnibus): 0.003 Jarque-Bera (JB): 13.811
Skew: 0.268 Prob(JB): 0.00100
Kurtosis: 3.586 Cond. No. 60.2
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
gpa1 = wool.data("gpa1")
# determine sample size & no. of regressors:
n = len(gpa1)
k = 2
# extract y:
y = gpa1["colGPA"]
# extract X & add a column of ones:
X = pd.DataFrame({"const": 1, "hsGPA": gpa1["hsGPA"], "ACT": gpa1["ACT"]})
# alternative with patsy:
y2, X2 = pt.dmatrices("colGPA ~ hsGPA + ACT", data=gpa1, return_type="dataframe")
# display first rows of X:
print(f"X.head(): \n{X.head()}\n")
X.head():
const hsGPA ACT
0 1 3.0 21
1 1 3.2 24
2 1 3.6 26
3 1 3.5 27
4 1 3.9 28
# parameter estimates:
X = np.array(X)
y = np.array(y).reshape(n, 1)  # creates a column vector
b = np.linalg.inv(X.T @ X) @ X.T @ y
print(f"b: \n{b}\n")
b:
[[1.28632777]
[0.45345589]
[0.00942601]]
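Explicitly inverting X'X mirrors the textbook formula, but a least-squares solver is numerically preferable; np.linalg.lstsq yields the same estimates (a quick sketch):
# numerically stabler alternative to the explicit inverse:
b_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]
print(f"b_lstsq: \n{b_lstsq}\n")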
# residuals, estimated variance of u and SER:
u_hat = y - X @ b
sigsq_hat = (u_hat.T @ u_hat) / (n - k - 1)
SER = np.sqrt(sigsq_hat)
print(f"SER: {SER}\n")
SER: [[0.34031576]]
# estimated variance of the parameter estimators and SE:
Vbeta_hat = sigsq_hat * np.linalg.inv(X.T @ X)
se = np.sqrt(np.diagonal(Vbeta_hat))
print(f"se: {se}\n")
se: [0.34082212 0.09581292 0.01077719]
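As a cross-check, the manual coefficients and standard errors should reproduce the smf.ols output. A short sketch (refitting the model, since results was reassigned above):
# compare manual results with the smf.ols fit:
results = smf.ols(formula="colGPA ~ hsGPA + ACT", data=gpa1).fit()
print(f"b matches: {np.allclose(b.flatten(), results.params.values)}\n")
print(f"se matches: {np.allclose(se, results.bse.values)}\n")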
3.3 Ceteris Paribus Interpretation and Omitted Variable Bias
gpa1 = wool.data("gpa1")
# parameter estimates for full and simple model:
reg = smf.ols(formula="colGPA ~ ACT + hsGPA", data=gpa1)
results = reg.fit()
b = results.params
print(f"b: \n{b}\n")
b:
Intercept 1.286328
ACT 0.009426
hsGPA 0.453456
dtype: float64
# relation between regressors:
reg_delta = smf.ols(formula="hsGPA ~ ACT", data=gpa1)
results_delta = reg_delta.fit()
delta_tilde = results_delta.params
print(f"delta_tilde: \n{delta_tilde}\n")
delta_tilde:
Intercept 2.462537
ACT 0.038897
dtype: float64
# omitted variables formula for b1_tilde:
b1_tilde = b["ACT"] + b["hsGPA"] * delta_tilde["ACT"]
print(f"b1_tilde: \n{b1_tilde}\n")
b1_tilde:
0.027063973943178603
# actual regression with hsGPA omitted:
reg_om = smf.ols(formula="colGPA ~ ACT", data=gpa1)
results_om = reg_om.fit()
b_om = results_om.params
print(f"b_om: \n{b_om}\n")
b_om:
Intercept 2.402979
ACT 0.027064
dtype: float64
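The short-regression coefficient on ACT matches the omitted variables formula exactly (up to floating-point error), as a quick check confirms:
# b_om["ACT"] equals b1_tilde from the omitted variables formula:
print(f"isclose: {np.isclose(b1_tilde, b_om['ACT'])}\n")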
3.4 Standard Errors, Multicollinearity, and VIF
gpa1 = wool.data("gpa1")
# full estimation results including automatic SE:
reg = smf.ols(formula="colGPA ~ hsGPA + ACT", data=gpa1)
results = reg.fit()
# extract SER (instead of calculation via residuals):
SER = np.sqrt(results.mse_resid)
# regressing hsGPA on ACT for calculation of R2 & VIF:
reg_hsGPA = smf.ols(formula="hsGPA ~ ACT", data=gpa1)
results_hsGPA = reg_hsGPA.fit()
R2_hsGPA = results_hsGPA.rsquared
VIF_hsGPA = 1 / (1 - R2_hsGPA)
print(f"VIF_hsGPA: {VIF_hsGPA}\n")
VIF_hsGPA: 1.1358234481972784
# manual calculation of SE of hsGPA coefficient:
n = results.nobs
sdx = np.std(gpa1["hsGPA"], ddof=1) * np.sqrt((n - 1) / n)
SE_hsGPA = 1 / np.sqrt(n) * SER / sdx * np.sqrt(VIF_hsGPA)
print(f"SE_hsGPA: {SE_hsGPA}\n")
SE_hsGPA: 0.09581291608057595
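This reproduces the standard error that statsmodels computes automatically, which can be confirmed directly from the fitted results above:
# compare with the automatically computed standard error:
print(f"results.bse['hsGPA']: {results.bse['hsGPA']}\n")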
wage1 = wool.data("wage1")
# extract matrices using patsy:
y, X = pt.dmatrices(
"np.log(wage) ~ educ + exper + tenure",
data=wage1,
return_type="dataframe",
)
# get VIF:
K = X.shape[1]
VIF = np.empty(K)
for i in range(K):
VIF[i] = smo.variance_inflation_factor(X.values, i)
print(f"VIF: \n{VIF}\n")
VIF:
[29.37890286 1.11277075 1.47761777 1.34929556]
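The first entry belongs to the intercept column, where the VIF is not meaningful as a collinearity diagnostic; labeling the values with the regressor names and dropping the intercept makes the output easier to read (a sketch):
# label VIFs with the column names from patsy and drop the intercept:
VIF_series = pd.Series(VIF, index=X.columns)
print(f"VIF_series: \n{VIF_series.drop('Intercept')}\n")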