%pip install numpy statsmodels wooldridge scipy -q
Note: you may need to restart the kernel to use updated packages.
import numpy as np
import statsmodels.formula.api as smf
import wooldridge as wool
from scipy import stats
# Two-sided critical values for alpha = 5% and 1% from the
# t distribution with 137 degrees of freedom:
alpha = np.array([0.05, 0.01])
upper_tail_prob = 1 - alpha / 2
cv_t = stats.t.ppf(upper_tail_prob, 137)
print(f"cv_t: {cv_t}\n")
cv_t: [1.97743121 2.61219198]
# Same two-sided alpha levels, but using the standard normal
# approximation (compare with the exact t values above):
normal_quantiles = 1 - alpha / 2
cv_n = stats.norm.ppf(normal_quantiles)
print(f"cv_n: {cv_n}\n")
cv_n: [1.95996398 2.5758293 ]
# Load the college-GPA data and estimate
# colGPA = b0 + b1*hsGPA + b2*ACT + b3*skipped + u by OLS:
gpa1 = wool.data("gpa1")
# store and display results:
gpa_model = smf.ols(formula="colGPA ~ hsGPA + ACT + skipped", data=gpa1)
results = gpa_model.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary():
OLS Regression Results
==============================================================================
Dep. Variable: colGPA R-squared: 0.234
Model: OLS Adj. R-squared: 0.217
Method: Least Squares F-statistic: 13.92
Date: Tue, 24 Sep 2024 Prob (F-statistic): 5.65e-08
Time: 14:59:27 Log-Likelihood: -41.501
No. Observations: 141 AIC: 91.00
Df Residuals: 137 BIC: 102.8
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 1.3896 0.332 4.191 0.000 0.734 2.045
hsGPA 0.4118 0.094 4.396 0.000 0.227 0.597
ACT 0.0147 0.011 1.393 0.166 -0.006 0.036
skipped -0.0831 0.026 -3.197 0.002 -0.135 -0.032
==============================================================================
Omnibus: 1.917 Durbin-Watson: 1.881
Prob(Omnibus): 0.383 Jarque-Bera (JB): 1.636
Skew: 0.125 Prob(JB): 0.441
Kurtosis: 2.535 Cond. No. 300.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Manually confirm the formulas behind the regression table:
# extract coefficients and standard errors from the fit.
b = results.params
se = results.bse
# reproduce t statistic: estimate divided by its standard error
tstat = b / se
print(f"tstat: \n{tstat}\n")
# reproduce two-sided p value: 2 * P(T <= -|t|). Take the residual
# degrees of freedom from the fitted model instead of hard-coding 137,
# so the cell stays correct if the specification changes.
pval = 2 * stats.t.cdf(-abs(tstat), results.df_resid)
print(f"pval: \n{pval}\n")
tstat:
Intercept 4.191039
hsGPA 4.396260
ACT 1.393319
skipped -3.196840
dtype: float64
pval:
[4.95026897e-05 2.19205015e-05 1.65779902e-01 1.72543113e-03]
# One-sided critical values for alpha = 5% and 1% from the
# t distribution with 522 degrees of freedom:
alpha = np.array([0.05, 0.01])
one_sided_quantiles = 1 - alpha
cv_t = stats.t.ppf(one_sided_quantiles, 522)
print(f"cv_t: {cv_t}\n")
cv_t: [1.64777794 2.33351273]
# Same one-sided alpha levels via the standard normal approximation:
normal_quantiles = 1 - alpha
cv_n = stats.norm.ppf(normal_quantiles)
print(f"cv_n: {cv_n}\n")
cv_n: [1.64485363 2.32634787]
# Estimate the log-wage equation on education, experience and tenure:
wage1 = wool.data("wage1")
wage_model = smf.ols(formula="np.log(wage) ~ educ + exper + tenure", data=wage1)
results = wage_model.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary():
OLS Regression Results
==============================================================================
Dep. Variable: np.log(wage) R-squared: 0.316
Model: OLS Adj. R-squared: 0.312
Method: Least Squares F-statistic: 80.39
Date: Tue, 24 Sep 2024 Prob (F-statistic): 9.13e-43
Time: 14:59:28 Log-Likelihood: -313.55
No. Observations: 526 AIC: 635.1
Df Residuals: 522 BIC: 652.2
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.2844 0.104 2.729 0.007 0.080 0.489
educ 0.0920 0.007 12.555 0.000 0.078 0.106
exper 0.0041 0.002 2.391 0.017 0.001 0.008
tenure 0.0221 0.003 7.133 0.000 0.016 0.028
==============================================================================
Omnibus: 11.534 Durbin-Watson: 1.769
Prob(Omnibus): 0.003 Jarque-Bera (JB): 20.941
Skew: 0.021 Prob(JB): 2.84e-05
Kurtosis: 3.977 Cond. No. 135.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# R&D intensity example: regress log R&D spending on log sales
# and the profit margin.
rdchem = wool.data("rdchem")
# OLS regression:
rd_model = smf.ols(formula="np.log(rd) ~ np.log(sales) + profmarg", data=rdchem)
results = rd_model.fit()
print(f"results.summary(): \n{results.summary()}\n")
results.summary():
OLS Regression Results
==============================================================================
Dep. Variable: np.log(rd) R-squared: 0.918
Model: OLS Adj. R-squared: 0.912
Method: Least Squares F-statistic: 162.2
Date: Tue, 24 Sep 2024 Prob (F-statistic): 1.79e-16
Time: 14:59:28 Log-Likelihood: -22.511
No. Observations: 32 AIC: 51.02
Df Residuals: 29 BIC: 55.42
Df Model: 2
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
Intercept -4.3783 0.468 -9.355 0.000 -5.335 -3.421
np.log(sales) 1.0842 0.060 18.012 0.000 0.961 1.207
profmarg 0.0217 0.013 1.694 0.101 -0.004 0.048
==============================================================================
Omnibus: 0.670 Durbin-Watson: 1.859
Prob(Omnibus): 0.715 Jarque-Bera (JB): 0.671
Skew: 0.308 Prob(JB): 0.715
Kurtosis: 2.649 Cond. No. 70.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# 95% confidence intervals for all coefficients:
CI95 = results.conf_int(alpha=0.05)
print(f"CI95: \n{CI95}\n")
CI95:
0 1
Intercept -5.335478 -3.421068
np.log(sales) 0.961107 1.207332
profmarg -0.004488 0.047799
# 99% confidence intervals for all coefficients:
CI99 = results.conf_int(alpha=0.01)
print(f"CI99: \n{CI99}\n")
CI99:
0 1
Intercept -5.668313 -3.088234
np.log(sales) 0.918299 1.250141
profmarg -0.013578 0.056890
# Baseball-salary example: manual F test of joint significance.
mlb1 = wool.data("mlb1")
n = len(mlb1)  # number of observations
# unrestricted OLS regression (includes the three hitting variables):
unrestricted_formula = "np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr"
reg_ur = smf.ols(formula=unrestricted_formula, data=mlb1)
fit_ur = reg_ur.fit()
r2_ur = fit_ur.rsquared
print(f"r2_ur: {r2_ur}\n")
r2_ur: 0.6278028485187442
# restricted OLS regression (drops bavg, hrunsyr and rbisyr):
reg_r = smf.ols(formula="np.log(salary) ~ years + gamesyr", data=mlb1)
fit_r = reg_r.fit()
r2_r = fit_r.rsquared
print(f"r2_r: {r2_r}\n")
r2_r: 0.5970716339066895
# F statistic for H0: bavg = hrunsyr = rbisyr = 0, built from the
# R-squared form: F = [(R2_ur - R2_r) / q] / [(1 - R2_ur) / (n - k - 1)].
q = 3           # number of restrictions tested
df_resid_ur = n - 6  # residual d.f. of the unrestricted model (k = 5 regressors)
fstat = (r2_ur - r2_r) / (1 - r2_ur) * df_resid_ur / q
print(f"fstat: {fstat}\n")
fstat: 9.550253521951914
# Critical value at alpha = 1% from the F distribution
# with 3 and 347 degrees of freedom:
significance = 0.01
cv = stats.f.ppf(1 - significance, 3, 347)
print(f"cv: {cv}\n")
cv: 3.838520048496057
# p value: upper-tail probability of F(3, 347) at the observed statistic.
# Use the survival function sf(x) = 1 - cdf(x), which is computed directly
# and stays accurate in the far tail, where 1 - stats.f.cdf(...) suffers
# catastrophic cancellation for very small p values.
fpval = stats.f.sf(fstat, 3, 347)
print(f"fpval: {fpval}\n")
fpval: 4.473708139829391e-06
# Same joint test, now done automatically by statsmodels.
mlb1 = wool.data("mlb1")
# OLS regression:
salary_model = smf.ols(
    formula="np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr",
    data=mlb1,
)
results = salary_model.fit()
# automated F test of H0: bavg = hrunsyr = rbisyr = 0:
hypotheses = ["bavg = 0", "hrunsyr = 0", "rbisyr = 0"]
ftest = results.f_test(hypotheses)
fstat = ftest.statistic
fpval = ftest.pvalue
print(f"fstat: {fstat}\n")
print(f"fpval: {fpval}\n")
fstat: 9.550253521951873
fpval: 4.4737081398389455e-06
# F test of two linear restrictions: bavg = 0 and hrunsyr = 2*rbisyr.
mlb1 = wool.data("mlb1")
# OLS regression:
salary_model = smf.ols(
    formula="np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr",
    data=mlb1,
)
results = salary_model.fit()
# automated F test with a non-exclusion restriction:
hypotheses = ["bavg = 0", "hrunsyr = 2*rbisyr"]
ftest = results.f_test(hypotheses)
fstat = ftest.statistic
fpval = ftest.pvalue
print(f"fstat: {fstat}\n")
print(f"fpval: {fpval}\n")
fstat: 0.5117822576247315
fpval: 0.5998780329146685