Skip to article frontmatterSkip to article content

3. Multiple Regression Analysis: Estimation

%pip install numpy pandas patsy statsmodels wooldridge -q
Note: you may need to restart the kernel to use updated packages.
import numpy as np
import pandas as pd
import patsy as pt
import statsmodels.formula.api as smf
import statsmodels.stats.outliers_influence as smo
import wooldridge as wool

3.1 Multiple Regression in Practice

$y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \cdots + \beta_k x_k + u$

Example 3.1: Determinants of College GPA

$\text{colGPA} = \beta_0 + \beta_1 \text{hsGPA} + \beta_2 \text{ACT} + u$
# Example 3.1: regress college GPA on high-school GPA and ACT score.
gpa1 = wool.data("gpa1")

# specify the model with a formula and estimate it by OLS:
reg = smf.ols("colGPA ~ hsGPA + ACT", data=gpa1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary(): 
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 colGPA   R-squared:                       0.176
Model:                            OLS   Adj. R-squared:                  0.164
Method:                 Least Squares   F-statistic:                     14.78
Date:                Mon, 16 Sep 2024   Prob (F-statistic):           1.53e-06
Time:                        16:38:29   Log-Likelihood:                -46.573
No. Observations:                 141   AIC:                             99.15
Df Residuals:                     138   BIC:                             108.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.2863      0.341      3.774      0.000       0.612       1.960
hsGPA          0.4535      0.096      4.733      0.000       0.264       0.643
ACT            0.0094      0.011      0.875      0.383      -0.012       0.031
==============================================================================
Omnibus:                        3.056   Durbin-Watson:                   1.885
Prob(Omnibus):                  0.217   Jarque-Bera (JB):                2.469
Skew:                           0.199   Prob(JB):                        0.291
Kurtosis:                       2.488   Cond. No.                         298.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Example 3.3 Hourly Wage Equation

$\log(\text{wage}) = \beta_0 + \beta_1 \text{educ} + \beta_2 \text{exper} + \beta_3 \text{tenure} + u$
# Example 3.3: log hourly wage explained by education, experience, and tenure.
wage1 = wool.data("wage1")

# log-level wage equation estimated by OLS:
reg = smf.ols("np.log(wage) ~ educ + exper + tenure", data=wage1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary(): 
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(wage)   R-squared:                       0.316
Model:                            OLS   Adj. R-squared:                  0.312
Method:                 Least Squares   F-statistic:                     80.39
Date:                Mon, 16 Sep 2024   Prob (F-statistic):           9.13e-43
Time:                        16:38:29   Log-Likelihood:                -313.55
No. Observations:                 526   AIC:                             635.1
Df Residuals:                     522   BIC:                             652.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2844      0.104      2.729      0.007       0.080       0.489
educ           0.0920      0.007     12.555      0.000       0.078       0.106
exper          0.0041      0.002      2.391      0.017       0.001       0.008
tenure         0.0221      0.003      7.133      0.000       0.016       0.028
==============================================================================
Omnibus:                       11.534   Durbin-Watson:                   1.769
Prob(Omnibus):                  0.003   Jarque-Bera (JB):               20.941
Skew:                           0.021   Prob(JB):                     2.84e-05
Kurtosis:                       3.977   Cond. No.                         135.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Example 3.4: Participation in 401(k) Pension Plans

$\text{prate} = \beta_0 + \beta_1 \text{mrate} + \beta_2 \text{age} + u$
# Example 3.4: 401(k) participation rate vs. match rate and plan age.
k401k = wool.data("401k")

# estimate the participation equation by OLS:
reg = smf.ols("prate ~ mrate + age", data=k401k)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary(): 
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  prate   R-squared:                       0.092
Model:                            OLS   Adj. R-squared:                  0.091
Method:                 Least Squares   F-statistic:                     77.79
Date:                Mon, 16 Sep 2024   Prob (F-statistic):           6.67e-33
Time:                        16:38:29   Log-Likelihood:                -6422.3
No. Observations:                1534   AIC:                         1.285e+04
Df Residuals:                    1531   BIC:                         1.287e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     80.1190      0.779    102.846      0.000      78.591      81.647
mrate          5.5213      0.526     10.499      0.000       4.490       6.553
age            0.2431      0.045      5.440      0.000       0.155       0.331
==============================================================================
Omnibus:                      375.579   Durbin-Watson:                   1.910
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              805.992
Skew:                          -1.387   Prob(JB):                    9.57e-176
Kurtosis:                       5.217   Cond. No.                         32.5
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Example 3.5a: Explaining Arrest Records

$\text{narr86} = \beta_0 + \beta_1 \text{pcnv} + \beta_2 \text{ptime86} + \beta_3 \text{qemp86} + u$
# Example 3.5a: number of 1986 arrests, excluding average sentence length.
crime1 = wool.data("crime1")

# estimate the model without the avgsen regressor:
reg = smf.ols("narr86 ~ pcnv + ptime86 + qemp86", data=crime1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary(): 
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 narr86   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.040
Method:                 Least Squares   F-statistic:                     39.10
Date:                Mon, 16 Sep 2024   Prob (F-statistic):           9.91e-25
Time:                        16:38:29   Log-Likelihood:                -3394.7
No. Observations:                2725   AIC:                             6797.
Df Residuals:                    2721   BIC:                             6821.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.7118      0.033     21.565      0.000       0.647       0.776
pcnv          -0.1499      0.041     -3.669      0.000      -0.230      -0.070
ptime86       -0.0344      0.009     -4.007      0.000      -0.051      -0.018
qemp86        -0.1041      0.010    -10.023      0.000      -0.124      -0.084
==============================================================================
Omnibus:                     2394.860   Durbin-Watson:                   1.836
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           106169.153
Skew:                           4.002   Prob(JB):                         0.00
Kurtosis:                      32.513   Cond. No.                         8.27
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Example 3.5b: Explaining Arrest Records

$\text{narr86} = \beta_0 + \beta_1 \text{pcnv} + \beta_2 \text{avgsen} + \beta_3 \text{ptime86} + \beta_4 \text{qemp86} + u$
# Example 3.5b: same arrest equation, now adding average sentence length.
crime1 = wool.data("crime1")

# estimate the model including the avgsen regressor:
reg = smf.ols("narr86 ~ pcnv + avgsen + ptime86 + qemp86", data=crime1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary(): 
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 narr86   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                     29.96
Date:                Mon, 16 Sep 2024   Prob (F-statistic):           2.01e-24
Time:                        16:38:29   Log-Likelihood:                -3393.5
No. Observations:                2725   AIC:                             6797.
Df Residuals:                    2720   BIC:                             6826.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.7068      0.033     21.319      0.000       0.642       0.772
pcnv          -0.1508      0.041     -3.692      0.000      -0.231      -0.071
avgsen         0.0074      0.005      1.572      0.116      -0.002       0.017
ptime86       -0.0374      0.009     -4.252      0.000      -0.055      -0.020
qemp86        -0.1033      0.010     -9.940      0.000      -0.124      -0.083
==============================================================================
Omnibus:                     2396.990   Durbin-Watson:                   1.837
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           106841.658
Skew:                           4.006   Prob(JB):                         0.00
Kurtosis:                      32.611   Cond. No.                         10.2
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Example 3.6: Hourly Wage Equation

$\log(\text{wage}) = \beta_0 + \beta_1 \text{educ} + u$
# Example 3.6: simple regression of log wage on education only
# (for comparison with the multiple regression of Example 3.3).
wage1 = wool.data("wage1")

# estimate by OLS:
reg = smf.ols("np.log(wage) ~ educ", data=wage1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary(): 
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(wage)   R-squared:                       0.186
Model:                            OLS   Adj. R-squared:                  0.184
Method:                 Least Squares   F-statistic:                     119.6
Date:                Mon, 16 Sep 2024   Prob (F-statistic):           3.27e-25
Time:                        16:38:29   Log-Likelihood:                -359.38
No. Observations:                 526   AIC:                             722.8
Df Residuals:                     524   BIC:                             731.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.5838      0.097      5.998      0.000       0.393       0.775
educ           0.0827      0.008     10.935      0.000       0.068       0.098
==============================================================================
Omnibus:                       11.804   Durbin-Watson:                   1.801
Prob(Omnibus):                  0.003   Jarque-Bera (JB):               13.811
Skew:                           0.268   Prob(JB):                      0.00100
Kurtosis:                       3.586   Cond. No.                         60.2
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

3.2 OLS in Matrix Form

$\hat{\beta} = (X'X)^{-1}X'y$
gpa1 = wool.data("gpa1")

# sample size and number of regressors (excluding the constant):
n = len(gpa1)
k = 2

# dependent variable:
y = gpa1["colGPA"]

# regressor matrix with a leading column of ones for the intercept:
X = gpa1[["hsGPA", "ACT"]].copy()
X.insert(0, "const", 1)

# the same matrices can be produced automatically with patsy:
y2, X2 = pt.dmatrices("colGPA ~ hsGPA + ACT", data=gpa1, return_type="dataframe")

# inspect the first rows of the regressor matrix:
print(f"X.head(): \n{X.head()}\n")
X.head(): 
   const  hsGPA  ACT
0      1    3.0   21
1      1    3.2   24
2      1    3.6   26
3      1    3.5   27
4      1    3.9   28

# parameter estimates via the normal equations (X'X) b = X'y:
X = np.array(X)
y = np.array(y).reshape(n, 1)  # reshape into a column vector (n x 1)
# np.linalg.solve is numerically more stable than forming inv(X'X) explicitly
# and then multiplying:
b = np.linalg.solve(X.T @ X, X.T @ y)
print(f"b: \n{b}\n")
b: 
[[1.28632777]
 [0.45345589]
 [0.00942601]]

$\hat{u} = y - X\hat{\beta}$
$\hat{\sigma}^2 = \frac{1}{n-k-1} \hat{u}'\hat{u}$
# OLS residuals:
u_hat = y - X @ b
# estimated error variance using the residual degrees of freedom:
dof = n - k - 1
sigsq_hat = (u_hat.T @ u_hat) / dof
# standard error of the regression (SER):
SER = np.sqrt(sigsq_hat)
print(f"SER: {SER}\n")
SER: [[0.34031576]]

$\widehat{\text{var}(\hat{\beta})} = \hat{\sigma}^2 (X'X)^{-1}$
# estimated covariance matrix of the OLS estimator:
XXi = np.linalg.inv(X.T @ X)
Vbeta_hat = sigsq_hat * XXi
# standard errors are the square roots of its diagonal elements:
se = np.sqrt(np.diagonal(Vbeta_hat))
print(f"se: {se}\n")
se: [0.34082212 0.09581292 0.01077719]

3.3 Ceteris Paribus Interpretation and Omitted Variable Bias

gpa1 = wool.data("gpa1")

# long regression: colGPA on both ACT and hsGPA:
reg = smf.ols("colGPA ~ ACT + hsGPA", data=gpa1)
results = reg.fit()
b = results.params
print(f"b: \n{b}\n")
b: 
Intercept    1.286328
ACT          0.009426
hsGPA        0.453456
dtype: float64

# auxiliary regression of the included regressor on the one to be omitted:
reg_delta = smf.ols("hsGPA ~ ACT", data=gpa1)
results_delta = reg_delta.fit()
delta_tilde = results_delta.params
print(f"delta_tilde: \n{delta_tilde}\n")
delta_tilde: 
Intercept    2.462537
ACT          0.038897
dtype: float64

# omitted-variable formula: biased slope = beta1 + beta2 * delta1:
b1_tilde = b["ACT"] + b["hsGPA"] * delta_tilde["ACT"]
print(f"b1_tilde:  \n{b1_tilde}\n")
b1_tilde:  
0.027063973943178603

# short regression that omits hsGPA entirely — its ACT slope should
# match b1_tilde from the omitted-variable formula:
reg_om = smf.ols("colGPA ~ ACT", data=gpa1)
results_om = reg_om.fit()
b_om = results_om.params
print(f"b_om: \n{b_om}\n")
b_om: 
Intercept    2.402979
ACT          0.027064
dtype: float64

3.4 Standard Errors, Multicollinearity, and VIF

gpa1 = wool.data("gpa1")

# estimate the full model; statsmodels computes the SEs automatically:
reg = smf.ols("colGPA ~ hsGPA + ACT", data=gpa1)
results = reg.fit()

# standard error of the regression taken from the fitted model:
SER = np.sqrt(results.mse_resid)

# auxiliary regression of hsGPA on ACT gives the R-squared needed for the VIF:
reg_hsGPA = smf.ols("hsGPA ~ ACT", data=gpa1)
results_hsGPA = reg_hsGPA.fit()
R2_hsGPA = results_hsGPA.rsquared
VIF_hsGPA = 1 / (1 - R2_hsGPA)
print(f"VIF_hsGPA: {VIF_hsGPA}\n")
VIF_hsGPA: 1.1358234481972784

# manual calculation of SE of hsGPA coefficient:
# SE(b_j) = 1/sqrt(n) * SER / sd(x_j) * sqrt(VIF_j)
n = results.nobs
# rescale the sample (ddof=1) standard deviation by sqrt((n-1)/n) to get
# the population (1/n) standard deviation used in the SE formula:
sdx = np.std(gpa1["hsGPA"], ddof=1) * np.sqrt((n - 1) / n)
SE_hsGPA = 1 / np.sqrt(n) * SER / sdx * np.sqrt(VIF_hsGPA)
print(f"SE_hsGPA: {SE_hsGPA}\n")
SE_hsGPA: 0.09581291608057595

wage1 = wool.data("wage1")

# build response and design matrices with patsy:
y, X = pt.dmatrices(
    "np.log(wage) ~ educ + exper + tenure",
    data=wage1,
    return_type="dataframe",
)

# variance inflation factor for every column of X (intercept included):
K = X.shape[1]
VIF = np.array([smo.variance_inflation_factor(X.values, i) for i in range(K)])
print(f"VIF: \n{VIF}\n")
VIF: 
[29.37890286  1.11277075  1.47761777  1.34929556]