%pip install numpy pandas patsy statsmodels wooldridge -q
Note: you may need to restart the kernel to use updated packages.
import numpy as np
import pandas as pd
import patsy as pt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import wooldridge as wool
gpa3 = wool.data("gpa3")
# define regression model:
reg = smf.ols(
formula="cumgpa ~ sat + hsperc + tothrs + female + black + white",
data=gpa3,
subset=(gpa3["spring"] == 1),
)
# estimate default model (only for spring data):
results_default = reg.fit()
table_default = pd.DataFrame(
{
"b": round(results_default.params, 5),
"se": round(results_default.bse, 5),
"t": round(results_default.tvalues, 5),
"pval": round(results_default.pvalues, 5),
},
)
print(f"table_default: \n{table_default}\n")
table_default:
b se t pval
Intercept 1.47006 0.22980 6.39706 0.00000
sat 0.00114 0.00018 6.38850 0.00000
hsperc -0.00857 0.00124 -6.90600 0.00000
tothrs 0.00250 0.00073 3.42551 0.00068
female 0.30343 0.05902 5.14117 0.00000
black -0.12828 0.14737 -0.87049 0.38462
white -0.05872 0.14099 -0.41650 0.67730
# estimate model with White SE (only for spring data):
results_white = reg.fit(cov_type="HC0")
table_white = pd.DataFrame(
{
"b": round(results_white.params, 5),
"se": round(results_white.bse, 5),
"t": round(results_white.tvalues, 5),
"pval": round(results_white.pvalues, 5),
},
)
print(f"table_white: \n{table_white}\n")
table_white:
b se t pval
Intercept 1.47006 0.21856 6.72615 0.00000
sat 0.00114 0.00019 6.01360 0.00000
hsperc -0.00857 0.00140 -6.10008 0.00000
tothrs 0.00250 0.00073 3.41365 0.00064
female 0.30343 0.05857 5.18073 0.00000
black -0.12828 0.11810 -1.08627 0.27736
white -0.05872 0.11032 -0.53228 0.59453
# estimate model with refined White SE (only for spring data):
results_refined = reg.fit(cov_type="HC3")
table_refined = pd.DataFrame(
{
"b": round(results_refined.params, 5),
"se": round(results_refined.bse, 5),
"t": round(results_refined.tvalues, 5),
"pval": round(results_refined.pvalues, 5),
},
)
print(f"table_refined: \n{table_refined}\n")
table_refined:
b se t pval
Intercept 1.47006 0.22938 6.40885 0.00000
sat 0.00114 0.00020 5.84017 0.00000
hsperc -0.00857 0.00144 -5.93407 0.00000
tothrs 0.00250 0.00075 3.34177 0.00083
female 0.30343 0.06004 5.05388 0.00000
black -0.12828 0.12819 -1.00074 0.31695
white -0.05872 0.12044 -0.48758 0.62585
gpa3 = wool.data("gpa3")
# definition of model and hypotheses:
reg = smf.ols(
formula="cumgpa ~ sat + hsperc + tothrs + female + black + white",
data=gpa3,
subset=(gpa3["spring"] == 1),
)
hypotheses = ["black = 0", "white = 0"]
# F-Tests using different variance-covariance formulas:
# ususal VCOV:
results_default = reg.fit()
ftest_default = results_default.f_test(hypotheses)
fstat_default = ftest_default.statistic
fpval_default = ftest_default.pvalue
print(f"fstat_default: {fstat_default}\n")
print(f"fpval_default: {fpval_default}\n")
fstat_default: 0.6796041956073422
fpval_default: 0.5074683622584049
# refined White VCOV:
results_hc3 = reg.fit(cov_type="HC3")
ftest_hc3 = results_hc3.f_test(hypotheses)
fstat_hc3 = ftest_hc3.statistic
fpval_hc3 = ftest_hc3.pvalue
print(f"fstat_hc3: {fstat_hc3}\n")
print(f"fpval_hc3: {fpval_hc3}\n")
fstat_hc3: 0.6724692957656673
fpval_hc3: 0.5110883633440992
# classical White VCOV:
results_hc0 = reg.fit(cov_type="HC0")
ftest_hc0 = results_hc0.f_test(hypotheses)
fstat_hc0 = ftest_hc0.statistic
fpval_hc0 = ftest_hc0.pvalue
print(f"fstat_hc0: {fstat_hc0}\n")
print(f"fpval_hc0: {fpval_hc0}\n")
fstat_hc0: 0.7477969818036264
fpval_hc0: 0.4741442714738484
hprice1 = wool.data("hprice1")
# estimate model:
reg = smf.ols(formula="price ~ lotsize + sqrft + bdrms", data=hprice1)
results = reg.fit()
table_results = pd.DataFrame(
{
"b": round(results.params, 4),
"se": round(results.bse, 4),
"t": round(results.tvalues, 4),
"pval": round(results.pvalues, 4),
},
)
print(f"table_results: \n{table_results}\n")
table_results:
b se t pval
Intercept -21.7703 29.4750 -0.7386 0.4622
lotsize 0.0021 0.0006 3.2201 0.0018
sqrft 0.1228 0.0132 9.2751 0.0000
bdrms 13.8525 9.0101 1.5374 0.1279
# automatic BP test (LM version):
y, X = pt.dmatrices(
"price ~ lotsize + sqrft + bdrms",
data=hprice1,
return_type="dataframe",
)
result_bp_lm = sm.stats.diagnostic.het_breuschpagan(results.resid, X)
bp_lm_statistic = result_bp_lm[0]
bp_lm_pval = result_bp_lm[1]
print(f"bp_lm_statistic: {bp_lm_statistic}\n")
print(f"bp_lm_pval: {bp_lm_pval}\n")
bp_lm_statistic: 14.092385504350183
bp_lm_pval: 0.0027820595556891643
# manual BP test (F version):
hprice1["resid_sq"] = results.resid**2
reg_resid = smf.ols(formula="resid_sq ~ lotsize + sqrft + bdrms", data=hprice1)
results_resid = reg_resid.fit()
bp_F_statistic = results_resid.fvalue
bp_F_pval = results_resid.f_pvalue
print(f"bp_F_statistic: {bp_F_statistic}\n")
print(f"bp_F_pval: {bp_F_pval}\n")
bp_F_statistic: 5.338919363241395
bp_F_pval: 0.002047744420936124
Example 8.5: BP and White test in the Log Housing Price Equation¶
hprice1 = wool.data("hprice1")
# estimate model:
reg = smf.ols(
formula="np.log(price) ~ np.log(lotsize) + np.log(sqrft) + bdrms",
data=hprice1,
)
results = reg.fit()
# BP test:
y, X_bp = pt.dmatrices(
"np.log(price) ~ np.log(lotsize) + np.log(sqrft) + bdrms",
data=hprice1,
return_type="dataframe",
)
result_bp = sm.stats.diagnostic.het_breuschpagan(results.resid, X_bp)
bp_statistic = result_bp[0]
bp_pval = result_bp[1]
print(f"bp_statistic: {bp_statistic}\n")
print(f"bp_pval: {bp_pval}\n")
bp_statistic: 4.223245741805276
bp_pval: 0.23834482631492962
# White test:
X_wh = pd.DataFrame(
{
"const": 1,
"fitted_reg": results.fittedvalues,
"fitted_reg_sq": results.fittedvalues**2,
},
)
result_white = sm.stats.diagnostic.het_breuschpagan(results.resid, X_wh)
white_statistic = result_white[0]
white_pval = result_white[1]
print(f"white_statistic: {white_statistic}\n")
print(f"white_pval: {white_pval}\n")
white_statistic: 3.4472865468746834
white_pval: 0.17841494794136223
k401ksubs = wool.data("401ksubs")
# subsetting data:
k401ksubs_sub = k401ksubs[k401ksubs["fsize"] == 1]
# OLS (only for singles, i.e. 'fsize'==1):
reg_ols = smf.ols(
formula="nettfa ~ inc + I((age-25)**2) + male + e401k",
data=k401ksubs_sub,
)
results_ols = reg_ols.fit(cov_type="HC0")
# print regression table:
table_ols = pd.DataFrame(
{
"b": round(results_ols.params, 4),
"se": round(results_ols.bse, 4),
"t": round(results_ols.tvalues, 4),
"pval": round(results_ols.pvalues, 4),
},
)
print(f"table_ols: \n{table_ols}\n")
table_ols:
b se t pval
Intercept -20.9850 3.4909 -6.0114 0.0000
inc 0.7706 0.0994 7.7486 0.0000
I((age - 25) ** 2) 0.0251 0.0043 5.7912 0.0000
male 2.4779 2.0558 1.2053 0.2281
e401k 6.8862 2.2837 3.0153 0.0026
# WLS:
wls_weight = list(1 / k401ksubs_sub["inc"])
reg_wls = smf.wls(
formula="nettfa ~ inc + I((age-25)**2) + male + e401k",
weights=wls_weight,
data=k401ksubs_sub,
)
results_wls = reg_wls.fit()
# print regression table:
table_wls = pd.DataFrame(
{
"b": round(results_wls.params, 4),
"se": round(results_wls.bse, 4),
"t": round(results_wls.tvalues, 4),
"pval": round(results_wls.pvalues, 4),
},
)
print(f"table_wls: \n{table_wls}\n")
table_wls:
b se t pval
Intercept -16.7025 1.9580 -8.5304 0.0000
inc 0.7404 0.0643 11.5140 0.0000
I((age - 25) ** 2) 0.0175 0.0019 9.0796 0.0000
male 1.8405 1.5636 1.1771 0.2393
e401k 5.1883 1.7034 3.0458 0.0024
k401ksubs = wool.data("401ksubs")
# subsetting data:
k401ksubs_sub = k401ksubs[k401ksubs["fsize"] == 1]
# WLS:
wls_weight = list(1 / k401ksubs_sub["inc"])
reg_wls = smf.wls(
formula="nettfa ~ inc + I((age-25)**2) + male + e401k",
weights=wls_weight,
data=k401ksubs_sub,
)
# non-robust (default) results:
results_wls = reg_wls.fit()
table_default = pd.DataFrame(
{
"b": round(results_wls.params, 4),
"se": round(results_wls.bse, 4),
"t": round(results_wls.tvalues, 4),
"pval": round(results_wls.pvalues, 4),
},
)
print(f"table_default: \n{table_default}\n")
table_default:
b se t pval
Intercept -16.7025 1.9580 -8.5304 0.0000
inc 0.7404 0.0643 11.5140 0.0000
I((age - 25) ** 2) 0.0175 0.0019 9.0796 0.0000
male 1.8405 1.5636 1.1771 0.2393
e401k 5.1883 1.7034 3.0458 0.0024
# robust results (Refined White SE):
results_white = reg_wls.fit(cov_type="HC3")
table_white = pd.DataFrame(
{
"b": round(results_white.params, 4),
"se": round(results_white.bse, 4),
"t": round(results_white.tvalues, 4),
"pval": round(results_white.pvalues, 4),
},
)
print(f"table_white: \n{table_white}\n")
table_white:
b se t pval
Intercept -16.7025 2.2482 -7.4292 0.0000
inc 0.7404 0.0752 9.8403 0.0000
I((age - 25) ** 2) 0.0175 0.0026 6.7650 0.0000
male 1.8405 1.3132 1.4015 0.1611
e401k 5.1883 1.5743 3.2955 0.0010
Example 8.7: Demand for Cigarettes¶
smoke = wool.data("smoke")
# OLS:
reg_ols = smf.ols(
formula="cigs ~ np.log(income) + np.log(cigpric) +"
"educ + age + I(age**2) + restaurn",
data=smoke,
)
results_ols = reg_ols.fit()
table_ols = pd.DataFrame(
{
"b": round(results_ols.params, 4),
"se": round(results_ols.bse, 4),
"t": round(results_ols.tvalues, 4),
"pval": round(results_ols.pvalues, 4),
},
)
print(f"table_ols: \n{table_ols}\n")
table_ols:
b se t pval
Intercept -3.6398 24.0787 -0.1512 0.8799
np.log(income) 0.8803 0.7278 1.2095 0.2268
np.log(cigpric) -0.7509 5.7733 -0.1301 0.8966
educ -0.5015 0.1671 -3.0016 0.0028
age 0.7707 0.1601 4.8132 0.0000
I(age ** 2) -0.0090 0.0017 -5.1765 0.0000
restaurn -2.8251 1.1118 -2.5410 0.0112
# BP test:
y, X = pt.dmatrices(
"cigs ~ np.log(income) + np.log(cigpric) + educ +age + I(age**2) + restaurn",
data=smoke,
return_type="dataframe",
)
result_bp = sm.stats.diagnostic.het_breuschpagan(results_ols.resid, X)
bp_statistic = result_bp[0]
bp_pval = result_bp[1]
print(f"bp_statistic: {bp_statistic}\n")
print(f"bp_pval: {bp_pval}\n")
bp_statistic: 32.25841908120121
bp_pval: 1.4557794830278942e-05
# FGLS (estimation of the variance function):
smoke["logu2"] = np.log(results_ols.resid**2)
reg_fgls = smf.ols(
formula="logu2 ~ np.log(income) + np.log(cigpric) +"
"educ + age + I(age**2) + restaurn",
data=smoke,
)
results_fgls = reg_fgls.fit()
table_fgls = pd.DataFrame(
{
"b": round(results_fgls.params, 4),
"se": round(results_fgls.bse, 4),
"t": round(results_fgls.tvalues, 4),
"pval": round(results_fgls.pvalues, 4),
},
)
print(f"table_fgls: \n{table_fgls}\n")
table_fgls:
b se t pval
Intercept -1.9207 2.5630 -0.7494 0.4538
np.log(income) 0.2915 0.0775 3.7634 0.0002
np.log(cigpric) 0.1954 0.6145 0.3180 0.7506
educ -0.0797 0.0178 -4.4817 0.0000
age 0.2040 0.0170 11.9693 0.0000
I(age ** 2) -0.0024 0.0002 -12.8931 0.0000
restaurn -0.6270 0.1183 -5.2982 0.0000
# FGLS (WLS):
wls_weight = list(1 / np.exp(results_fgls.fittedvalues))
reg_wls = smf.wls(
formula="cigs ~ np.log(income) + np.log(cigpric) +"
"educ + age + I(age**2) + restaurn",
weights=wls_weight,
data=smoke,
)
results_wls = reg_wls.fit()
table_wls = pd.DataFrame(
{
"b": round(results_wls.params, 4),
"se": round(results_wls.bse, 4),
"t": round(results_wls.tvalues, 4),
"pval": round(results_wls.pvalues, 4),
},
)
print(f"table_wls: \n{table_wls}\n")
table_wls:
b se t pval
Intercept 5.6355 17.8031 0.3165 0.7517
np.log(income) 1.2952 0.4370 2.9639 0.0031
np.log(cigpric) -2.9403 4.4601 -0.6592 0.5099
educ -0.4634 0.1202 -3.8570 0.0001
age 0.4819 0.0968 4.9784 0.0000
I(age ** 2) -0.0056 0.0009 -5.9897 0.0000
restaurn -3.4611 0.7955 -4.3508 0.0000