
4. Multiple Regression Analysis: Inference

%pip install numpy statsmodels wooldridge scipy -q
Note: you may need to restart the kernel to use updated packages.
import numpy as np
import statsmodels.formula.api as smf
import wooldridge as wool
from scipy import stats

4.1 The t Test

4.1.1 General Setup

$$H_0: \beta_j = a_j$$
$$H_1: \beta_j \neq a_j \quad \text{or} \quad H_1: \beta_j > a_j \quad \text{or} \quad H_1: \beta_j < a_j$$
$$t = \frac{\hat{\beta}_j - a_j}{se(\hat{\beta}_j)}$$
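
The regression summary only reports t statistics for the standard null $H_0: \beta_j = 0$. For a nonzero null value $a_j$, the statistic is easily computed by hand from the fitted results. A minimal sketch, using the wage data from Example 4.1 below and a hypothetical null value of 0.08 (an 8% return to education) chosen purely for illustration:

import numpy as np
import statsmodels.formula.api as smf
import wooldridge as wool
from scipy import stats

wage1 = wool.data("wage1")
results = smf.ols(formula="np.log(wage) ~ educ + exper + tenure", data=wage1).fit()

# t statistic for H0: beta_educ = 0.08 (illustrative null value):
a_j = 0.08
tstat = (results.params["educ"] - a_j) / results.bse["educ"]

# two-sided p value from the t distribution with n-k-1 d.f.:
pval = 2 * stats.t.cdf(-abs(tstat), results.df_resid)
print(f"tstat: {tstat}\n")
print(f"pval: {pval}\n")

The built-in method results.t_test("educ = 0.08") performs the same test automatically.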

4.1.2 Standard Case

$$H_0: \beta_j = 0, \qquad H_1: \beta_j \neq 0$$
$$t_{\hat{\beta}_j} = \frac{\hat{\beta}_j}{se(\hat{\beta}_j)}$$
$$\text{reject } H_0 \text{ if } |t_{\hat{\beta}_j}| > c$$
$$p_{\hat{\beta}_j} = 2 \cdot F_{t_{n-k-1}}(-|t_{\hat{\beta}_j}|)$$
$$\text{reject } H_0 \text{ if } p_{\hat{\beta}_j} < \alpha$$

Example 4.3: Determinants of College GPA

$$\text{colGPA} = \beta_0 + \beta_1 \text{hsGPA} + \beta_2 \text{ACT} + \beta_3 \text{skipped} + u$$

With n = 141 observations and k = 3 regressors, the t statistics have n - k - 1 = 137 degrees of freedom.
# CV for alpha=5% and 1% using the t distribution with 137 d.f.:
alpha = np.array([0.05, 0.01])
cv_t = stats.t.ppf(1 - alpha / 2, 137)
print(f"cv_t: {cv_t}\n")
cv_t: [1.97743121 2.61219198]

# CV for alpha=5% and 1% using the normal approximation:
cv_n = stats.norm.ppf(1 - alpha / 2)
print(f"cv_n: {cv_n}\n")
cv_n: [1.95996398 2.5758293 ]

gpa1 = wool.data("gpa1")

# store and display results:
reg = smf.ols(formula="colGPA ~ hsGPA + ACT + skipped", data=gpa1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary(): 
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 colGPA   R-squared:                       0.234
Model:                            OLS   Adj. R-squared:                  0.217
Method:                 Least Squares   F-statistic:                     13.92
Date:                Tue, 24 Sep 2024   Prob (F-statistic):           5.65e-08
Time:                        14:59:27   Log-Likelihood:                -41.501
No. Observations:                 141   AIC:                             91.00
Df Residuals:                     137   BIC:                             102.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.3896      0.332      4.191      0.000       0.734       2.045
hsGPA          0.4118      0.094      4.396      0.000       0.227       0.597
ACT            0.0147      0.011      1.393      0.166      -0.006       0.036
skipped       -0.0831      0.026     -3.197      0.002      -0.135      -0.032
==============================================================================
Omnibus:                        1.917   Durbin-Watson:                   1.881
Prob(Omnibus):                  0.383   Jarque-Bera (JB):                1.636
Skew:                           0.125   Prob(JB):                        0.441
Kurtosis:                       2.535   Cond. No.                         300.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# manually confirm the formulas, i.e. extract coefficients and SE:
b = results.params
se = results.bse

# reproduce t statistic:
tstat = b / se
print(f"tstat: \n{tstat}\n")

# reproduce p value:
pval = 2 * stats.t.cdf(-abs(tstat), 137)
print(f"pval: \n{pval}\n")
tstat: 
Intercept    4.191039
hsGPA        4.396260
ACT          1.393319
skipped     -3.196840
dtype: float64

pval: 
[4.95026897e-05 2.19205015e-05 1.65779902e-01 1.72543113e-03]
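
The manual calculation can be cross-checked against the attributes statsmodels stores on the fitted results; results.tvalues and results.pvalues hold exactly the t statistics and two-sided p values shown in the summary table:

# built-in t statistics and p values:
print(f"results.tvalues: \n{results.tvalues}\n")
print(f"results.pvalues: \n{results.pvalues}\n")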

4.1.3 Other Hypotheses

Example 4.1: Hourly Wage Equation

$$\log(\text{wage}) = \beta_0 + \beta_1 \text{educ} + \beta_2 \text{exper} + \beta_3 \text{tenure} + u$$

Here the alternatives are one-sided (e.g. $H_1: \beta_{exper} > 0$), so the critical values below use the $1 - \alpha$ quantile rather than $1 - \alpha/2$. With n = 526 and k = 3, there are 522 degrees of freedom.
# CV for alpha=5% and 1% using the t distribution with 522 d.f.:
alpha = np.array([0.05, 0.01])
cv_t = stats.t.ppf(1 - alpha, 522)
print(f"cv_t: {cv_t}\n")
cv_t: [1.64777794 2.33351273]

# CV for alpha=5% and 1% using the normal approximation:
cv_n = stats.norm.ppf(1 - alpha)
print(f"cv_n: {cv_n}\n")
cv_n: [1.64485363 2.32634787]

wage1 = wool.data("wage1")

reg = smf.ols(formula="np.log(wage) ~ educ + exper + tenure", data=wage1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary(): 
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(wage)   R-squared:                       0.316
Model:                            OLS   Adj. R-squared:                  0.312
Method:                 Least Squares   F-statistic:                     80.39
Date:                Tue, 24 Sep 2024   Prob (F-statistic):           9.13e-43
Time:                        14:59:28   Log-Likelihood:                -313.55
No. Observations:                 526   AIC:                             635.1
Df Residuals:                     522   BIC:                             652.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2844      0.104      2.729      0.007       0.080       0.489
educ           0.0920      0.007     12.555      0.000       0.078       0.106
exper          0.0041      0.002      2.391      0.017       0.001       0.008
tenure         0.0221      0.003      7.133      0.000       0.016       0.028
==============================================================================
Omnibus:                       11.534   Durbin-Watson:                   1.769
Prob(Omnibus):                  0.003   Jarque-Bera (JB):               20.941
Skew:                           0.021   Prob(JB):                     2.84e-05
Kurtosis:                       3.977   Cond. No.                         135.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
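
Because the alternatives here are one-sided, the relevant p value is the one-sided tail probability, i.e. half the two-sided p value from the summary table when the t statistic has the expected sign. A minimal sketch for $H_0: \beta_{exper} = 0$ against $H_1: \beta_{exper} > 0$, reusing the fitted results above:

# one-sided p value P(T > t) for exper:
t_exper = results.tvalues["exper"]
pval_onesided = stats.t.sf(t_exper, results.df_resid)
print(f"t_exper: {t_exper}\n")
print(f"pval_onesided: {pval_onesided}\n")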

4.2 Confidence Intervals

$$\hat{\beta}_j \pm c \cdot se(\hat{\beta}_j)$$

where $c$ is the appropriate quantile of the $t_{n-k-1}$ distribution, e.g. the 97.5% quantile for a 95% confidence interval.

Example 4.8: Model of R&D Expenditures

$$\log(\text{rd}) = \beta_0 + \beta_1 \log(\text{sales}) + \beta_2 \text{profmarg} + u$$
rdchem = wool.data("rdchem")

# OLS regression:
reg = smf.ols(formula="np.log(rd) ~ np.log(sales) + profmarg", data=rdchem)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary(): 
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             np.log(rd)   R-squared:                       0.918
Model:                            OLS   Adj. R-squared:                  0.912
Method:                 Least Squares   F-statistic:                     162.2
Date:                Tue, 24 Sep 2024   Prob (F-statistic):           1.79e-16
Time:                        14:59:28   Log-Likelihood:                -22.511
No. Observations:                  32   AIC:                             51.02
Df Residuals:                      29   BIC:                             55.42
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -4.3783      0.468     -9.355      0.000      -5.335      -3.421
np.log(sales)     1.0842      0.060     18.012      0.000       0.961       1.207
profmarg          0.0217      0.013      1.694      0.101      -0.004       0.048
==============================================================================
Omnibus:                        0.670   Durbin-Watson:                   1.859
Prob(Omnibus):                  0.715   Jarque-Bera (JB):                0.671
Skew:                           0.308   Prob(JB):                        0.715
Kurtosis:                       2.649   Cond. No.                         70.6
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# 95% CI:
CI95 = results.conf_int(0.05)
print(f"CI95: \n{CI95}\n")
CI95: 
                      0         1
Intercept     -5.335478 -3.421068
np.log(sales)  0.961107  1.207332
profmarg      -0.004488  0.047799

# 99% CI:
CI99 = results.conf_int(0.01)
print(f"CI99: \n{CI99}\n")
CI99: 
                      0         1
Intercept     -5.668313 -3.088234
np.log(sales)  0.918299  1.250141
profmarg      -0.013578  0.056890
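
conf_int applies the formula from the start of this section; the 95% interval can be reproduced manually from the coefficients, standard errors, and the t critical value with 29 degrees of freedom:

# manual 95% CI:
c = stats.t.ppf(1 - 0.05 / 2, results.df_resid)
print(f"lower: \n{results.params - c * results.bse}\n")
print(f"upper: \n{results.params + c * results.bse}\n")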

4.3 Linear Restrictions: F Tests

$$\log(\text{salary}) = \beta_0 + \beta_1 \text{years} + \beta_2 \text{gamesyr} + \beta_3 \text{bavg} + \beta_4 \text{hrunsyr} + \beta_5 \text{rbisyr} + u$$

We test whether the three performance measures are jointly significant once years and games per year are controlled for, i.e. $H_0: \beta_3 = \beta_4 = \beta_5 = 0$, which imposes q = 3 restrictions. With n = 353 and k = 5, n - k - 1 = 347. The F statistic can be computed from either the SSR or the $R^2$ of the restricted (r) and unrestricted (ur) models:

$$F = \frac{SSR_r - SSR_{ur}}{SSR_{ur}} \cdot \frac{n - k - 1}{q} = \frac{R^2_{ur} - R^2_r}{1 - R^2_{ur}} \cdot \frac{n - k - 1}{q}$$
mlb1 = wool.data("mlb1")
n = mlb1.shape[0]

# unrestricted OLS regression:
reg_ur = smf.ols(
    formula="np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr",
    data=mlb1,
)
fit_ur = reg_ur.fit()
r2_ur = fit_ur.rsquared
print(f"r2_ur: {r2_ur}\n")
r2_ur: 0.6278028485187442

# restricted OLS regression:
reg_r = smf.ols(formula="np.log(salary) ~ years + gamesyr", data=mlb1)
fit_r = reg_r.fit()
r2_r = fit_r.rsquared
print(f"r2_r: {r2_r}\n")
r2_r: 0.5970716339066895

# F statistic:
fstat = (r2_ur - r2_r) / (1 - r2_ur) * (n - 6) / 3
print(f"fstat: {fstat}\n")
fstat: 9.550253521951914

# CV for alpha=1% using the F distribution with 3 and 347 d.f.:
cv = stats.f.ppf(1 - 0.01, 3, 347)
print(f"cv: {cv}\n")
cv: 3.838520048496057

# p value = 1-cdf of the appropriate F distribution:
fpval = 1 - stats.f.cdf(fstat, 3, 347)
print(f"fpval: {fpval}\n")
fpval: 4.473708139829391e-06
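
Equivalently, scipy's survival function computes the upper-tail probability directly, which avoids the loss of precision of 1 - cdf for very small p values:

# same p value via the survival function:
fpval_sf = stats.f.sf(fstat, 3, 347)
print(f"fpval_sf: {fpval_sf}\n")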

mlb1 = wool.data("mlb1")

# OLS regression:
reg = smf.ols(
    formula="np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr",
    data=mlb1,
)
results = reg.fit()

# automated F test:
hypotheses = ["bavg = 0", "hrunsyr = 0", "rbisyr = 0"]
ftest = results.f_test(hypotheses)
fstat = ftest.statistic
fpval = ftest.pvalue

print(f"fstat: {fstat}\n")
print(f"fpval: {fpval}\n")
fstat: 9.550253521951873

fpval: 4.4737081398389455e-06
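
The same comparison can also be run on the two fitted models directly. A sketch using anova_lm from statsmodels, which tabulates the residual sums of squares of the nested models together with the F statistic and its p value (this reuses fit_r and fit_ur from the cells above):

import statsmodels.api as sm

# F test by comparing the restricted and unrestricted fits:
anova_table = sm.stats.anova_lm(fit_r, fit_ur)
print(f"anova_table: \n{anova_table}\n")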

mlb1 = wool.data("mlb1")

# OLS regression:
reg = smf.ols(
    formula="np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr",
    data=mlb1,
)
results = reg.fit()

# automated F test:
hypotheses = ["bavg = 0", "hrunsyr = 2*rbisyr"]
ftest = results.f_test(hypotheses)
fstat = ftest.statistic
fpval = ftest.pvalue

print(f"fstat: {fstat}\n")
print(f"fpval: {fpval}\n")
fstat: 0.5117822576247315

fpval: 0.5998780329146685
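
A related special case is the test of overall significance, which restricts all slope coefficients to zero; its F statistic is the one printed in the summary header. As a sketch, it can be reproduced with f_test and compared against the stored attribute results.fvalue:

# overall F test of H0: all slope coefficients are zero:
hypotheses = ["years = 0", "gamesyr = 0", "bavg = 0", "hrunsyr = 0", "rbisyr = 0"]
ftest_all = results.f_test(hypotheses)
print(f"ftest_all.statistic: {ftest_all.statistic}\n")
print(f"results.fvalue: {results.fvalue}\n")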