Learning Objectives
Upon completion of this chapter, readers will demonstrate proficiency in heteroskedasticity detection and correction using Python by:
8.1 Computing heteroskedasticity-robust standard errors using statsmodels get_robustcov_results with HC0, HC1, HC2, and HC3 covariance estimators.
8.2 Implementing the Breusch-Pagan test for heteroskedasticity using fitted values and residuals from OLS regression with statsmodels.
8.3 Conducting the White test for heteroskedasticity using squared regressors and cross-products through statsmodels diagnostic functions.
8.4 Estimating weighted least squares (WLS) models by specifying analytical weights in statsmodels WLS class when heteroskedasticity form is known.
8.5 Implementing feasible generalized least squares (FGLS) for heteroskedasticity correction using two-stage estimation with statsmodels.
8.6 Comparing OLS and heteroskedasticity-robust inference results by computing standard errors, t-statistics, and p-values under both approaches.
8.7 Analyzing linear probability models (LPM) with inherent heteroskedasticity using robust standard errors and comparing predictions to probability bounds.
8.8 Visualizing heteroskedasticity patterns through residual plots against fitted values and regressors using matplotlib and seaborn.
Non-constant error variance (heteroskedasticity) represents one of the most common violations of classical regression assumptions in applied econometric work. This chapter examines the consequences of heteroskedasticity for OLS estimation and inference, develops formal tests for detecting its presence, and presents both robust inference methods and efficient estimation procedures that account for non-constant variance.
The development proceeds from diagnosis to correction strategies. We establish that heteroskedasticity preserves consistency and unbiasedness of OLS but invalidates standard inference procedures (Section 8.1), introduce heteroskedasticity-robust standard errors as a general solution (Section 8.2), develop formal tests for detecting heteroskedasticity including the Breusch-Pagan and White tests (Section 8.3), examine the linear probability model as a context where heteroskedasticity is inherent (Section 8.4), and conclude with weighted least squares (WLS) as an efficient estimation method when the form of heteroskedasticity is known (Section 8.5). Throughout, we emphasize practical implementation and interpretation using Python’s statsmodels library.
Consequences of Heteroskedasticity:
Under assumptions MLR.1-MLR.4 (linearity, random sampling, no perfect collinearity, and zero conditional mean), but with heteroskedasticity (violation of MLR.5):
Consistency and Unbiasedness: OLS coefficient estimates remain unbiased and consistent. That is, $E(\hat{\beta}_j) = \beta_j$ and $\hat{\beta}_j \overset{p}{\longrightarrow} \beta_j$ as $n \to \infty$. The point estimates themselves are not affected by heteroskedasticity.
Invalid Inference: The usual OLS standard errors, $\mathrm{se}(\hat{\beta}_j)$, are biased and inconsistent estimates of the true standard errors, even in large samples. Consequently, the usual t-statistics, F-statistics, confidence intervals, and p-values are invalid. They can be either too small or too large, leading to incorrect inference (e.g., rejecting true null hypotheses too often or failing to reject false null hypotheses).
Loss of Efficiency: OLS is no longer BLUE (Best Linear Unbiased Estimator). There exist other linear unbiased estimators (e.g., Weighted Least Squares with correct weights) that have smaller variances than OLS. However, OLS remains consistent, which is often sufficient for large samples.
Solutions: Use heteroskedasticity-robust standard errors (e.g., White/HC standard errors) for valid inference with OLS estimates, or use Weighted Least Squares (WLS) if the form of heteroskedasticity can be modeled.
We will cover:
How to obtain valid inference in the presence of heteroskedasticity using robust standard errors.
Methods for testing whether heteroskedasticity is present.
Weighted Least Squares (WLS) as an alternative estimation method that can be more efficient than OLS when the form of heteroskedasticity is known or can be reasonably estimated.
First, let’s install and import the necessary libraries.
# %pip install numpy pandas patsy statsmodels wooldridge -q
import numpy as np
import pandas as pd
import patsy as pt # Used for creating design matrices easily from formulas
import statsmodels.api as sm # Provides statistical models and tests
import statsmodels.formula.api as smf # Convenient formula interface for statsmodels
import wooldridge as wool # Access to Wooldridge textbook datasets
from IPython.display import display
8.1 Heteroskedasticity-Robust Inference¶
Even if heteroskedasticity is present, we can still use OLS coefficient estimates but compute different standard errors that are robust to the presence of heteroskedasticity (of unknown form). These are often called White standard errors or heteroskedasticity-consistent (HC) standard errors.
Using robust standard errors allows for valid t-tests, F-tests, and confidence intervals based on the OLS estimates, even if the error variance is not constant. statsmodels provides several versions of robust standard errors (HC0, HC1, HC2, HC3), which differ mainly in their finite-sample adjustments. HC0 is the original White estimator, while HC1, HC2, and HC3 apply different corrections, often performing better in smaller samples (HC3 is often recommended).
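Before turning to the worked example, here is a minimal sketch of the mechanics (it simply re-fits the Example 8.2 model below once per covariance option, so the data set and formula are taken from that example): every variant is requested through the cov_type argument of fit.
# Minimal sketch: collect the standard errors from each covariance estimator
# side by side for the Example 8.2 model (defined again in the example below).
import pandas as pd
import statsmodels.formula.api as smf
import wooldridge as wool
from IPython.display import display

gpa3_prev = wool.data("gpa3")
reg_prev = smf.ols(
    formula="cumgpa ~ sat + hsperc + tothrs + female + black + white",
    data=gpa3_prev,
    subset=(gpa3_prev["spring"] == 1),
)
se_by_type = {}
for ct in ["nonrobust", "HC0", "HC1", "HC2", "HC3"]:
    res = reg_prev.fit() if ct == "nonrobust" else reg_prev.fit(cov_type=ct)
    se_by_type[ct] = res.bse  # standard errors under this covariance estimator
display(pd.DataFrame(se_by_type).round(5))
Only the standard errors differ across the fits; the coefficient estimates are unchanged, as Example 8.2 confirms in detail.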
Example 8.2: Heteroskedasticity-Robust Inference for GPA Equation¶
We estimate a model for college cumulative GPA (cumgpa) using data for students observed in the spring semester (spring == 1). We compare the standard OLS results with results using robust standard errors.
# Load the GPA data
gpa3 = wool.data("gpa3")
# Define the regression model using statsmodels formula API
# We predict cumgpa based on SAT score, high school percentile, total credit hours,
# gender, and race dummies.
# We only use data for the spring semester using the 'subset' argument.
reg = smf.ols(
formula="cumgpa ~ sat + hsperc + tothrs + female + black + white",
data=gpa3,
subset=(gpa3["spring"] == 1), # Use only spring observations
)
# --- Estimate with Default (Homoskedasticity-Assumed) Standard Errors ---
results_default = reg.fit()
# Display the results in a table
# OLS Results with Default Standard Errors
table_default = pd.DataFrame(
{
"b": round(results_default.params, 5), # OLS Coefficients
"se": round(results_default.bse, 5), # Default Standard Errors
"t": round(results_default.tvalues, 5), # t-statistics based on default SEs
"pval": round(results_default.pvalues, 5), # p-values based on default SEs
},
)
display(table_default)
# Interpretation (Default): Based on these standard errors, variables like sat, hsperc,
# tothrs, female, and black appear statistically significant (p < 0.05).
# However, if heteroskedasticity is present, these SEs and p-values are unreliable.
# --- Estimate with White's Original Robust Standard Errors (HC0) ---
# We fit the same model, but specify cov_type='HC0' to get robust SEs.
results_white = reg.fit(cov_type="HC0")
# Display the results
# OLS Results with Robust (HC0) Standard Errors
table_white = pd.DataFrame(
{
"b": round(results_white.params, 5), # OLS Coefficients (same as default)
"se": round(results_white.bse, 5), # Robust (HC0) Standard Errors
"t": round(results_white.tvalues, 5), # Robust t-statistics
"pval": round(results_white.pvalues, 5), # Robust p-values
},
)
display(table_white)
# Interpretation (HC0): The coefficient estimates 'b' are identical to the default OLS run.
# However, the standard errors 'se' have changed for most variables compared to the default.
# For example, the SE for 'tothrs' increased from 0.00104 to 0.00121, reducing its t-statistic
# and increasing its p-value (though still significant). The SE for 'black' decreased slightly.
# The conclusions about significance might change depending on the variable and significance level.
# --- Estimate with Refined Robust Standard Errors (HC3) ---
# HC3 applies a different small-sample correction, often preferred over HC0.
results_refined = reg.fit(cov_type="HC3")
# Display the results
# OLS Results with Robust (HC3) Standard Errors
table_refined = pd.DataFrame(
{
"b": round(results_refined.params, 5), # OLS Coefficients (same as default)
"se": round(results_refined.bse, 5), # Robust (HC3) Standard Errors
"t": round(results_refined.tvalues, 5), # Robust t-statistics
"pval": round(results_refined.pvalues, 5), # Robust p-values
},
)
display(table_refined)
# Interpretation (HC3): The HC3 robust standard errors are slightly different from the HC0 SEs
# (e.g., SE for 'tothrs' is 0.00123 with HC3 vs 0.00121 with HC0). In this specific case,
# the differences between HC0 and HC3 are minor and don't change the conclusions about
# statistical significance compared to HC0. Using robust standard errors confirms that
# sat, hsperc, tothrs, female, and black have statistically significant effects on cumgpa
# in this sample, even if heteroskedasticity is present.
Robust standard errors can also be used for hypothesis tests involving multiple restrictions, such as F-tests. We test the joint significance of the race dummies (black and white), comparing the standard F-test (assuming homoskedasticity) with robust F-tests.
# Reload data if needed
gpa3 = wool.data("gpa3")
# Define the model again
reg = smf.ols(
formula="cumgpa ~ sat + hsperc + tothrs + female + black + white",
data=gpa3,
subset=(gpa3["spring"] == 1),
)
# Define the null hypothesis for the F-test: H0: beta_black = 0 AND beta_white = 0
hypotheses = ["black = 0", "white = 0"]
# --- F-Test using Default (Homoskedasticity-Assumed) VCOV ---
results_default = reg.fit() # Fit with default SEs
ftest_default = results_default.f_test(hypotheses)
fstat_default = ftest_default.statistic # Extract F-statistic value
fpval_default = ftest_default.pvalue
# F-Test Comparison
ftest_comparison = pd.DataFrame(
{
"Test Type": ["Default F-test"],
"F-statistic": [f"{fstat_default:.4f}"],
"p-value": [f"{fpval_default:.4f}"],
},
)
display(ftest_comparison)
# Interpretation (Default F-Test): The default F-test has F-statistic 0.6796 and p-value 0.5075,
# failing to reject the null hypothesis that race variables are jointly insignificant.
# --- F-Test using Robust (HC3) VCOV ---
results_hc3 = reg.fit(cov_type="HC3") # Fit with HC3 robust SEs
ftest_hc3 = results_hc3.f_test(
hypotheses,
) # Perform F-test using the robust VCOV matrix
fstat_hc3 = ftest_hc3.statistic
fpval_hc3 = ftest_hc3.pvalue
ftest_hc3 = pd.DataFrame(
{
"Test Type": ["Robust (HC3) F-test"],
"F-statistic": [f"{fstat_hc3:.4f}"],
"p-value": [f"{fpval_hc3:.4f}"],
},
)
display(ftest_hc3)
# Interpretation (HC3 F-Test): The robust F-statistic (0.6725) is slightly smaller than the
# default (0.6796), and the p-value (0.5111) is slightly larger but consistent with the default.
# The conclusion remains the same: we fail to reject the null hypothesis and conclude that race
# is not jointly statistically significant, even after accounting for potential heteroskedasticity.
# --- F-Test using Robust (HC0) VCOV ---
results_hc0 = reg.fit(cov_type="HC0") # Fit with HC0 robust SEs
ftest_hc0 = results_hc0.f_test(hypotheses)
fstat_hc0 = ftest_hc0.statistic
fpval_hc0 = ftest_hc0.pvalue
ftest_hc0 = pd.DataFrame(
{
"Test Type": ["Robust (HC0) F-test"],
"F-statistic": [f"{fstat_hc0:.4f}"],
"p-value": [f"{fpval_hc0:.4f}"],
},
)
display(ftest_hc0)
# Interpretation (HC0 F-Test): The HC0 robust F-test gives F-statistic 0.7478 and p-value 0.4741,
# very similar results to the HC3 test. In general, if the default and robust test statistics
# lead to different conclusions, the robust result is preferred.
8.2 Heteroskedasticity Tests¶
While robust inference provides a way to proceed despite heteroskedasticity, it’s often useful to formally test for its presence. Tests can help understand the data better and decide whether alternative estimation methods like WLS might be beneficial (for efficiency).
Breusch-Pagan Test¶
The Breusch-Pagan (BP) test checks if the error variance is systematically related to the explanatory variables.
$H_0$: Homoskedasticity ($\mathrm{Var}(u \mid \mathbf{x}) = \sigma^2$, constant variance)
$H_1$: Heteroskedasticity ($\mathrm{Var}(u \mid \mathbf{x})$ depends on $\mathbf{x}$)
The test involves:
Estimate the original model by OLS and obtain the squared residuals, $\hat{u}^2$.
Regress the squared residuals on the original explanatory variables: $\hat{u}^2 = \delta_0 + \delta_1 x_1 + \dots + \delta_k x_k + \text{error}$.
Test the joint significance of $\delta_1, \dots, \delta_k$. If they are jointly significant (using an F-test or LM test), we reject $H_0$ and conclude heteroskedasticity is present.
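The following is a minimal hand-computed sketch of the LM version, $LM = n \cdot R^2$ from the auxiliary regression, compared against a $\chi^2$ distribution; it anticipates the housing-price model of Example 8.4 below, so the data set and formula are borrowed from there, and the scipy import is an addition used only for the p-value.
# Minimal sketch (by hand): BP LM statistic = n * R-squared from the regression
# of squared residuals on the regressors; compare to a chi-squared distribution.
import scipy.stats as stats
import statsmodels.formula.api as smf
import wooldridge as wool

hprice1_bp = wool.data("hprice1")
results_bp = smf.ols("price ~ lotsize + sqrft + bdrms", data=hprice1_bp).fit()
hprice1_bp["u_sq"] = results_bp.resid**2
aux = smf.ols("u_sq ~ lotsize + sqrft + bdrms", data=hprice1_bp).fit()
lm_stat = aux.nobs * aux.rsquared  # LM = n * R^2 of the auxiliary regression
lm_pval = stats.chi2.sf(lm_stat, df=3)  # 3 slope coefficients under test
print(f"BP LM statistic: {lm_stat:.4f}, p-value: {lm_pval:.4f}")
Up to rounding, this should match the LM statistic reported by het_breuschpagan in Example 8.4.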
Example 8.4: Heteroskedasticity in a Housing Price Equation (Levels)¶
We test for heteroskedasticity in a model explaining house prices (price) using lot size (lotsize), square footage (sqrft), and number of bedrooms (bdrms).
# Load housing price data
hprice1 = wool.data("hprice1")
# 1. Estimate the original model (price in levels)
reg = smf.ols(formula="price ~ lotsize + sqrft + bdrms", data=hprice1)
results = reg.fit()
# Display the OLS results (for context)
# --- OLS Results (Levels Model) ---
table_results = pd.DataFrame(
{
"b": round(results.params, 4),
"se": round(results.bse, 4),
"t": round(results.tvalues, 4),
"pval": round(results.pvalues, 4),
},
)
table_results  # Display OLS estimates
# --- Breusch-Pagan Test (LM version) using statsmodels function ---
# We need the residuals from the original model and the design matrix (X).
# patsy.dmatrices helps create the X matrix easily from the formula.
y, X = pt.dmatrices(
"price ~ lotsize + sqrft + bdrms", # Formula defines X
data=hprice1,
return_type="dataframe",
)
# Perform Breusch-Pagan test for heteroskedasticity
# H_0: Var(u|X) = sigma^2 (homoskedasticity)
# H_1: Var(u|X) depends on X (heteroskedasticity)
# BREUSCH-PAGAN TEST FOR HETEROSKEDASTICITY
# Run the test using statsmodels built-in function
bp_test_results = sm.stats.diagnostic.het_breuschpagan(
results.resid, # OLS residuals
X, # Design matrix (predictors)
)
# Extract test statistics
bp_lm_statistic = bp_test_results[0] # Lagrange Multiplier test statistic
bp_lm_pvalue = bp_test_results[1] # p-value for LM test
bp_f_statistic = bp_test_results[2] # F-statistic version
bp_f_pvalue = bp_test_results[3] # p-value for F-test
# Display results with interpretation
bp_results = pd.DataFrame(
{
"Test": ["LM Test", "LM Test", "F Test", "F Test"],
"Metric": ["Statistic", "p-value", "Statistic", "p-value"],
"Value": [
f"{bp_lm_statistic:8.4f}",
f"{bp_lm_pvalue:8.4f}",
f"{bp_f_statistic:8.4f}",
f"{bp_f_pvalue:8.4f}",
],
},
)
display(bp_results)
# Interpretation
significance_level = 0.05
test_decision = pd.DataFrame(
{
"Decision": [
f"REJECT H_0 at {significance_level:.0%} level"
if bp_lm_pvalue < significance_level
else f"FAIL TO REJECT H_0 at {significance_level:.0%} level",
],
"Conclusion": [
"Evidence of heteroskedasticity detected"
if bp_lm_pvalue < significance_level
else "No significant evidence of heteroskedasticity",
],
"Recommendation": [
"Use robust standard errors for valid inference"
if bp_lm_pvalue < significance_level
else "OLS standard errors are likely valid",
],
},
)
test_decision
# --- Breusch-Pagan Test (F version) calculated manually ---
# This demonstrates the underlying steps.
# 2. Get squared residuals
hprice1["resid_sq"] = results.resid**2
# 3. Regress squared residuals on original predictors
reg_resid = smf.ols(formula="resid_sq ~ lotsize + sqrft + bdrms", data=hprice1)
results_resid = reg_resid.fit()
# The F-statistic from this regression is the BP F-test statistic.
bp_F_statistic = results_resid.fvalue
bp_F_pval = results_resid.f_pvalue
# Display Breusch-Pagan test results
pd.DataFrame(
{
"Metric": ["BP F statistic", "BP F p-value"],
"Value": [f"{bp_F_statistic:.4f}", f"{bp_F_pval:.4f}"],
},
)
# Interpretation (BP F Test): The F-statistic is 5.3389, and the p-value is 0.0020.
# This also leads to rejecting the null hypothesis of homoskedasticity at the 5% level.
# The conclusion matches the LM version. Using robust standard errors for the original
# price model is recommended.
White Test¶
The White test is a more general test for heteroskedasticity that doesn’t assume a specific linear relationship between the variance and the predictors. It tests whether the variance depends on any combination of the levels, squares, and cross-products of the original regressors.
A simplified version (often used in practice) regresses the squared residuals $\hat{u}^2$ on the OLS fitted values $\hat{y}$ and squared fitted values $\hat{y}^2$:
$$\hat{u}^2 = \delta_0 + \delta_1 \hat{y} + \delta_2 \hat{y}^2 + \text{error}.$$ An F-test (or LM test) for $H_0\colon \delta_1 = 0,\ \delta_2 = 0$ is performed.
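statsmodels also ships het_white for the full version of the test (levels, squares, and cross-products of the regressors). A minimal sketch applying it to the levels housing-price model from Example 8.4 follows; like het_breuschpagan, it returns the LM statistic, its p-value, and the F version with its p-value.
# Minimal sketch: full White test via het_white on the levels model of Example 8.4.
import patsy as pt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import wooldridge as wool

hprice1_w = wool.data("hprice1")
results_lev = smf.ols("price ~ lotsize + sqrft + bdrms", data=hprice1_w).fit()
y_lev, X_lev = pt.dmatrices(
    "price ~ lotsize + sqrft + bdrms",
    data=hprice1_w,
    return_type="dataframe",
)
white_lm, white_lm_pval, white_f, white_f_pval = sm.stats.diagnostic.het_white(
    results_lev.resid,  # OLS residuals
    X_lev,  # regressors including the constant
)
print(f"White LM statistic: {white_lm:.4f}, p-value: {white_lm_pval:.4f}")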
Example 8.5: BP and White test in the Log Housing Price Equation¶
Often, taking the logarithm of the dependent variable can mitigate heteroskedasticity. We now test the log-log housing price model.
# Load housing price data again if needed
hprice1 = wool.data("hprice1")
# Estimate the model with log(price) as the dependent variable
reg_log = smf.ols(
formula="np.log(price) ~ np.log(lotsize) + np.log(sqrft) + bdrms",
data=hprice1,
)
results_log = reg_log.fit()
# --- Breusch-Pagan Test (Log Model) ---
y_log, X_bp_log = pt.dmatrices(
"np.log(price) ~ np.log(lotsize) + np.log(sqrft) + bdrms",
data=hprice1,
return_type="dataframe",
)
# --- Breusch-Pagan Test (Log Model) ---
result_bp_log = sm.stats.diagnostic.het_breuschpagan(results_log.resid, X_bp_log)
bp_statistic_log = result_bp_log[0]
bp_pval_log = result_bp_log[1]
# Display Breusch-Pagan test results (log model)
pd.DataFrame(
{
"Metric": ["BP LM statistic", "BP LM p-value"],
"Value": [f"{bp_statistic_log:.4f}", f"{bp_pval_log:.4f}"],
},
)
# Interpretation (BP Test, Log Model): The LM statistic is 4.2232, and the p-value is 0.2383.
# We fail to reject the null hypothesis of homoskedasticity at conventional levels (e.g., 5% or 10%).
# This suggests that the BP test does not find evidence of heteroskedasticity related linearly
# to the log predictors in this log-transformed model.
# --- White Test (Simplified Version using Fitted Values, Log Model) ---
# Create the design matrix for the White test auxiliary regression
X_wh_log = pd.DataFrame(
{
"const": 1, # Include constant
"fitted_reg": results_log.fittedvalues, # OLS fitted values from log model
"fitted_reg_sq": results_log.fittedvalues**2, # Squared fitted values
},
)
# Use the het_breuschpagan function with the new design matrix X_wh_log
# --- White Test (Log Model) ---
result_white_log = sm.stats.diagnostic.het_breuschpagan(results_log.resid, X_wh_log)
white_statistic_log = result_white_log[0]
white_pval_log = result_white_log[1]
# Display White test results (log model)
pd.DataFrame(
{
"Metric": ["White LM statistic", "White LM p-value"],
"Value": [f"{white_statistic_log:.4f}", f"{white_pval_log:.4f}"],
},
)
# Interpretation (White Test, Log Model): The White test LM statistic is 3.4473, and the
# p-value is 0.1784. Like the BP test, the White test does not reject the null hypothesis
# of homoskedasticity at the 5% level. The variance does not appear to be related to the
# level of predicted log(price), even in the non-linear way captured by the squared
# fitted values. This suggests that after logging the dependent variable, heteroskedasticity
# is no longer significantly detected by either test.
8.3 Weighted Least Squares (WLS)¶
If heteroskedasticity is detected and we have an idea about its form, we can use Weighted Least Squares (WLS). WLS is a transformation of the original model that yields estimators that are BLUE (efficient) under heteroskedasticity, provided the variance structure is correctly specified.
The idea is to divide the original equation by a quantity proportional to the standard deviation of the error term, $\sqrt{h(\mathbf{x}_i)}$, where $\mathrm{Var}(u_i \mid \mathbf{x}_i) = \sigma^2 h(\mathbf{x}_i)$. This gives more weight to observations with smaller error variance and less weight to observations with larger error variance.
In practice, $h(\mathbf{x}_i)$ is unknown, so we use an estimate $\hat{h}_i$. This leads to Feasible Generalized Least Squares (FGLS). A common approach is to specify a model for the variance function, estimate it, and use the predicted variances to construct the weights $w_i = 1/\hat{h}_i$.
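To make the transformation concrete, here is a minimal sketch on simulated data (all names are hypothetical): WLS with analytic weights $1/h_i$ yields exactly the same coefficients as OLS applied to the equation in which the dependent variable, the regressors, and the intercept have all been divided by $\sqrt{h_i}$.
# Minimal sketch (simulated data): WLS with weights 1/h equals OLS on the
# transformed equation where y, x, and the constant are each divided by sqrt(h).
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(123)
n = 500
x = rng.uniform(1, 10, n)
h = x  # assume Var(u|x) = sigma^2 * x
u = rng.normal(0, np.sqrt(h))  # heteroskedastic errors
df_sim = pd.DataFrame({"y": 1 + 2 * x + u, "x": x, "h": h})

res_wls = smf.wls("y ~ x", weights=1 / df_sim["h"], data=df_sim).fit()

df_sim["y_s"] = df_sim["y"] / np.sqrt(df_sim["h"])  # transformed dependent variable
df_sim["c_s"] = 1 / np.sqrt(df_sim["h"])  # transformed constant term
df_sim["x_s"] = df_sim["x"] / np.sqrt(df_sim["h"])  # transformed regressor
res_trans = smf.ols("y_s ~ 0 + c_s + x_s", data=df_sim).fit()  # suppress intercept

print(res_wls.params.values)  # [intercept, slope] from WLS
print(res_trans.params.values)  # identical coefficients from transformed OLS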
Example 8.6: Financial Wealth Equation (WLS with Assumed Weights)¶
We model net total financial assets (nettfa) for single-person households (fsize == 1) as a function of income (inc), age (age), gender (male), and 401k eligibility (e401k). It’s plausible that the variance of nettfa increases with income. We assume $\mathrm{Var}(u \mid \text{inc}) = \sigma^2 \cdot \text{inc}$. Thus, the standard deviation is $\sigma \sqrt{\text{inc}}$, and the appropriate weight for WLS is $1/\text{inc}$.
# Load 401k subsample data
k401ksubs = wool.data("401ksubs")
# Create subset for single-person households
k401ksubs_sub = k401ksubs[
k401ksubs["fsize"] == 1
].copy() # Use .copy() to avoid SettingWithCopyWarning
# --- OLS Estimation (with Robust SEs for comparison) ---
# We estimate by OLS first, but use robust standard errors (HC0) because we suspect heteroskedasticity.
reg_ols = smf.ols(
formula="nettfa ~ inc + I((age-25)**2) + male + e401k", # Note: age modeled quadratically around 25
data=k401ksubs_sub,
)
results_ols = reg_ols.fit(cov_type="HC0") # Use robust SEs
# Display OLS results
# --- OLS Results with Robust (HC0) SEs (Singles Only) ---
table_ols = pd.DataFrame(
{
"b": round(results_ols.params, 4),
"se": round(results_ols.bse, 4),
"t": round(results_ols.tvalues, 4),
"pval": round(results_ols.pvalues, 4),
},
)
table_ols  # Display OLS with robust SEs
# --- WLS Estimation (Assuming Var = sigma^2 * inc) ---
# Define the weights as 1/inc. statsmodels expects a list or array of weights.
wls_weight = list(1 / k401ksubs_sub["inc"])
# Estimate using smf.wls, passing the weights vector
reg_wls = smf.wls(
formula="nettfa ~ inc + I((age-25)**2) + male + e401k",
weights=wls_weight,
data=k401ksubs_sub,
)
# By default, WLS provides standard errors assuming the weight specification is correct.
results_wls = reg_wls.fit()
# Display WLS results
# --- WLS Results (Weights = 1/inc) ---
table_wls = pd.DataFrame(
{
"b": round(results_wls.params, 4), # WLS Coefficients
"se": round(results_wls.bse, 4), # WLS Standard Errors (assume correct weights)
"t": round(results_wls.tvalues, 4), # WLS t-statistics
"pval": round(results_wls.pvalues, 4), # WLS p-values
},
)
table_wls # Display WLS estimates
# Interpretation (OLS vs WLS):
# Comparing WLS to OLS (robust), the coefficient estimates differ somewhat (e.g., 'inc' coeff
# is 0.7706 in OLS, 0.7404 in WLS). The WLS standard errors are generally smaller than the
# OLS robust SEs (e.g., SE for 'inc' is 0.0643 in WLS vs 0.0994 in OLS robust).
# This suggests WLS is more efficient *if* the assumption Var=sigma^2*inc is correct.
# The coefficient on e401k (eligibility) is positive and significant in both models,
# with point estimate 5.1883 in WLS vs 6.8862 in OLS robust.
What if our assumed variance function ($h(\text{inc}) = \text{inc}$) is wrong? The WLS estimator will still be consistent (under standard assumptions), but its standard errors might be incorrect, and it might not be efficient. We can compute robust standard errors for the WLS estimator to get valid inference even if the weights are misspecified.
# Reload data and prepare WLS if needed
k401ksubs = wool.data("401ksubs")
k401ksubs_sub = k401ksubs[k401ksubs["fsize"] == 1].copy()
wls_weight = list(1 / k401ksubs_sub["inc"])
reg_wls = smf.wls(
formula="nettfa ~ inc + I((age-25)**2) + male + e401k",
weights=wls_weight,
data=k401ksubs_sub,
)
# --- WLS Results with Default (Non-Robust) SEs ---
results_wls_default = reg_wls.fit()
# --- WLS Results with Default (Non-Robust) Standard Errors ---
table_default_wls = pd.DataFrame(
{
"b": round(results_wls_default.params, 4),
"se": round(results_wls_default.bse, 4),
"t": round(results_wls_default.tvalues, 4),
"pval": round(results_wls_default.pvalues, 4),
},
)
table_default_wls  # Display Default WLS SEs
# --- WLS Results with Robust (HC3) Standard Errors ---
# Fit the WLS model but request robust standard errors.
results_wls_robust = reg_wls.fit(cov_type="HC3")
# --- WLS Results with Robust (HC3) Standard Errors ---
table_robust_wls = pd.DataFrame(
{
"b": round(
results_wls_robust.params,
4,
), # WLS Coefficients (same as default WLS)
"se": round(results_wls_robust.bse, 4), # Robust SEs applied to WLS
"t": round(results_wls_robust.tvalues, 4), # Robust t-statistics
"pval": round(results_wls_robust.pvalues, 4), # Robust p-values
},
)
table_robust_wls # Display Robust WLS SEs
# Interpretation (Default WLS SE vs Robust WLS SE):
# Comparing the robust WLS SEs to the default WLS SEs, we see some differences, though
# perhaps less dramatic than the OLS vs Robust OLS comparison earlier. For instance, the
# robust SE for 'inc' (0.0752) is slightly larger than the default WLS SE (0.0643).
# The robust SE for e401k (1.5743), in contrast, is smaller than the default (1.7034).
# This suggests that the initial assumption Var=sigma^2*inc might not perfectly capture
# the true heteroskedasticity. However, the conclusions about significance remain largely
# unchanged in this case. Using robust standard errors with WLS provides insurance against
# misspecification of the variance function used for weights.
Example 8.7: Demand for Cigarettes (FGLS with Estimated Weights)¶
Here, we don’t assume the form of heteroskedasticity beforehand. Instead, we estimate it using Feasible GLS (FGLS).
Estimate the original model (cigarette demand) by OLS.
Obtain the residuals and square them.
Model the log of squared residuals as a function of the predictors: $\log(\hat{u}^2) = \delta_0 + \delta_1 x_1 + \dots + \delta_k x_k + e$. This models the variance function $h(\mathbf{x}) = \exp(\delta_0 + \delta_1 x_1 + \dots + \delta_k x_k)$.
Obtain the fitted values from this regression, $\hat{g}_i$. Exponentiate them to get estimates of the variance: $\hat{h}_i = \exp(\hat{g}_i)$.
Use weights $1/\hat{h}_i$ in a WLS estimation of the original model.
# Load smoking data
smoke = wool.data("smoke")
# --- Step 1: OLS Estimation of the Cigarette Demand Model ---
reg_ols_smoke = smf.ols(
formula="cigs ~ np.log(income) + np.log(cigpric) +"
"educ + age + I(age**2) + restaurn", # restaurn is restriction dummy
data=smoke,
)
results_ols_smoke = reg_ols_smoke.fit()
# --- OLS Results (Cigarette Demand) ---
table_ols_smoke = pd.DataFrame(
{
"b": round(results_ols_smoke.params, 4),
"se": round(results_ols_smoke.bse, 4),
"t": round(results_ols_smoke.tvalues, 4),
"pval": round(results_ols_smoke.pvalues, 4),
},
)
table_ols_smoke  # Display OLS estimates
# --- Test for Heteroskedasticity (BP Test) ---
y_smoke, X_smoke = pt.dmatrices(
"cigs ~ np.log(income) + np.log(cigpric) + educ +age + I(age**2) + restaurn",
data=smoke,
return_type="dataframe",
)
# --- Breusch-Pagan Test (Cigarette Demand) ---
result_bp_smoke = sm.stats.diagnostic.het_breuschpagan(results_ols_smoke.resid, X_smoke)
bp_statistic_smoke = result_bp_smoke[0]
bp_pval_smoke = result_bp_smoke[1]
# Display BP test results for smoking model
pd.DataFrame(
{
"Metric": ["BP LM statistic", "BP LM p-value"],
"Value": [f"{bp_statistic_smoke:.4f}", f"{bp_pval_smoke:.4f}"],
},
)
# Interpretation (BP Test): The p-value is 0.0000 (very small), strongly rejecting
# the null of homoskedasticity. FGLS is likely warranted for efficiency.
# --- Step 2 & 3: Model the Variance Function ---
# Get residuals, square them, take the log (add small constant if any residuals are zero)
smoke["resid_ols"] = results_ols_smoke.resid
# Ensure no log(0) issues if resid can be exactly zero (unlikely with continuous data)
# A common fix is adding a tiny constant or dropping zero residuals if they exist.
# Here, we assume residuals are non-zero. If errors occur, check this.
smoke["logu2"] = np.log(smoke["resid_ols"] ** 2)
# Regress log(u^2) on the original predictors
reg_varfunc = smf.ols(
formula="logu2 ~ np.log(income) + np.log(cigpric) +"
"educ + age + I(age**2) + restaurn",
data=smoke, # Need to handle potential -inf if resid^2 was 0
missing="drop", # Drop rows where logu2 might be invalid
)
results_varfunc = reg_varfunc.fit()
# --- Variance Function Estimation Results (log(u^2) regressed on X) ---
table_varfunc = pd.DataFrame(
{
"b": round(results_varfunc.params, 4),
"se": round(results_varfunc.bse, 4),
"t": round(results_varfunc.tvalues, 4),
"pval": round(results_varfunc.pvalues, 4),
},
)
table_varfunc # Display variance function estimates
# Interpretation (Variance Function): This regression tells us which variables are
# significantly related to the log error variance. For instance, log(income) (coefficient 0.2915),
# educ (coefficient -0.0797), age (coefficient 0.2040), and restaurn (coefficient -0.6270)
# appear to be significant predictors of the variance.
# --- Step 4 & 5: FGLS Estimation using Estimated Weights ---
# Get fitted values from the variance function regression
smoke["logh_hat"] = results_varfunc.fittedvalues
# Calculate the estimated variance h_hat = exp(logh_hat)
smoke["h_hat"] = np.exp(smoke["logh_hat"])
# Calculate the weights for WLS: w = 1 / h_hat
wls_weight_fgls = list(1 / smoke["h_hat"])
# Estimate the original demand model using WLS with estimated weights
reg_fgls_wls = smf.wls(
formula="cigs ~ np.log(income) + np.log(cigpric) +"
"educ + age + I(age**2) + restaurn",
weights=wls_weight_fgls,
data=smoke,
missing="drop", # Ensure consistent sample with variance estimation
)
results_fgls_wls = reg_fgls_wls.fit()
# --- FGLS (WLS with Estimated Weights) Results ---
table_fgls_wls = pd.DataFrame(
{
"b": round(results_fgls_wls.params, 4),
"se": round(results_fgls_wls.bse, 4), # FGLS standard errors
"t": round(results_fgls_wls.tvalues, 4),
"pval": round(results_fgls_wls.pvalues, 4),
},
)
table_fgls_wls # Display FGLS estimates
# Interpretation (FGLS vs OLS):
# Comparing the FGLS estimates to the original OLS estimates:
# - The coefficient for log(cigpric) is -2.9403 in FGLS vs -0.7509 in OLS. The FGLS estimate is still insignificant.
# - The coefficient for the restaurant smoking restriction ('restaurn') is -3.4611 in FGLS vs -2.8251 in OLS, and remains significant.
# - Standard errors have generally changed. For instance, the SE for log(income) decreased from 0.7278 (OLS) to 0.4370 (FGLS).
# - FGLS estimates are preferred for efficiency if the variance model is reasonably well-specified.
# One could also compute robust standard errors for the FGLS estimates as a further check.
8.4 The Linear Probability Model Revisited¶
In Chapter 7, we introduced the linear probability model (LPM) for binary dependent variables, where the outcome $y \in \{0, 1\}$. We noted that LPM has heteroskedastic errors by construction. In this section, we examine the form of this heteroskedasticity and explore appropriate estimation and inference methods.
8.4.1 Why the LPM Has Heteroskedasticity¶
Recall the LPM specification:

$$y = \beta_0 + \beta_1 x_1 + \dots + \beta_k x_k + u,$$

where $y$ is binary (0 or 1). Since $E(y \mid \mathbf{x}) = P(y = 1 \mid \mathbf{x})$, we can write:

$$P(y = 1 \mid \mathbf{x}) = \beta_0 + \beta_1 x_1 + \dots + \beta_k x_k \equiv p(\mathbf{x}).$$

The error term can only take two values:
When $y = 1$: $u = 1 - p(\mathbf{x})$, with probability $p(\mathbf{x})$
When $y = 0$: $u = -p(\mathbf{x})$, with probability $1 - p(\mathbf{x})$
Variance of the error:

$$\mathrm{Var}(u \mid \mathbf{x}) = p(\mathbf{x})\,[1 - p(\mathbf{x})].$$

Since $p(\mathbf{x})$ varies across observations (it depends on $\mathbf{x}$), the variance is not constant. This is heteroskedasticity.
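A quick simulation sketch (the probability values are hypothetical, chosen only for illustration) confirms that the sample variance of the LPM error equals $p(1-p)$ and therefore changes with $p$:
# Minimal simulation sketch: the LPM error u = y - p has variance p * (1 - p),
# which varies with p, so the error variance cannot be constant across x.
import numpy as np

rng_lpm = np.random.default_rng(42)
for p in [0.1, 0.5, 0.9]:  # hypothetical success probabilities
    y_sim = rng_lpm.binomial(1, p, size=1_000_000)  # simulated binary outcomes
    u_sim = y_sim - p  # LPM error when the true probability is p
    print(f"p = {p}: Var(u) = {u_sim.var():.4f}, p*(1-p) = {p * (1 - p):.4f}")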
Implications:
OLS is still unbiased and consistent for the $\beta_j$
OLS standard errors are incorrect - must use robust SEs
WLS can improve efficiency if we model the variance correctly
8.4.2 Heteroskedasticity-Robust Inference for LPM¶
The standard practice for LPM is to estimate with OLS and use heteroskedasticity-robust standard errors:
# Example: Labor force participation from Ch7
mroz = wool.data("mroz")
# Estimate LPM with OLS
lpm_ols = smf.ols(
formula="inlf ~ nwifeinc + educ + exper + I(exper**2) + age + kidslt6 + kidsge6",
data=mroz,
)
results_lpm_ols = lpm_ols.fit()
# Usual (incorrect) standard errors
table_usual = pd.DataFrame(
{
"b": round(results_lpm_ols.params, 6),
"se_usual": round(results_lpm_ols.bse, 6),
"t_usual": round(results_lpm_ols.tvalues, 4),
},
)
# Robust standard errors (HC3)
results_lpm_robust = lpm_ols.fit(cov_type="HC3")
table_robust = pd.DataFrame(
{
"b": round(results_lpm_robust.params, 6),
"se_robust": round(results_lpm_robust.bse, 6),
"t_robust": round(results_lpm_robust.tvalues, 4),
},
)
# Compare
comparison_lpm = pd.DataFrame(
{
"Coefficient": results_lpm_ols.params.index,
"Estimate": round(results_lpm_ols.params, 6),
"SE (usual)": round(results_lpm_ols.bse, 6),
"SE (robust)": round(results_lpm_robust.bse, 6),
"Ratio": round(results_lpm_robust.bse / results_lpm_ols.bse, 3),
},
)
print("LPM: Comparison of Usual vs Robust Standard Errors")
comparison_lpm
Interpretation:
The robust SEs can be either larger or smaller than usual SEs
For LPM, always report robust SEs and use them for t-tests and confidence intervals
The “Ratio” column shows how much the SEs change when accounting for heteroskedasticity
8.4.3 Weighted Least Squares for LPM¶
Since we know the form of heteroskedasticity in the LPM, we can use WLS to potentially improve efficiency. The variance is:

$$\mathrm{Var}(u \mid \mathbf{x}) = p(\mathbf{x})\,[1 - p(\mathbf{x})],$$

where $p(\mathbf{x}) = \beta_0 + \beta_1 x_1 + \dots + \beta_k x_k$ is unknown. The feasible WLS procedure is:
Estimate LPM with OLS to get fitted values $\hat{y}_i$
Compute estimated variance: $\hat{h}_i = \hat{y}_i (1 - \hat{y}_i)$
Use weights $1/\hat{h}_i$ for WLS estimation
Implementation:
# Step 1: Get fitted probabilities from OLS
mroz["p_hat"] = results_lpm_ols.fittedvalues
# Step 2: Compute estimated variance h_hat = p_hat * (1 - p_hat)
# To avoid division by zero or negative weights if p_hat is outside [0,1],
# we can either:
# (a) Trim predictions to (0.01, 0.99), or
# (b) Only use observations where 0 < p_hat < 1
mroz["h_hat"] = mroz["p_hat"] * (1 - mroz["p_hat"])
# Remove observations where h_hat <= 0 (would give invalid weights)
mroz_wls = mroz[mroz["h_hat"] > 0].copy()
print(f"Observations with valid weights: {len(mroz_wls)} out of {len(mroz)}")
# Step 3: WLS estimation with weights = 1/h_hat
lpm_wls = smf.wls(
formula="inlf ~ nwifeinc + educ + exper + I(exper**2) + age + kidslt6 + kidsge6",
weights=1 / mroz_wls["h_hat"],
data=mroz_wls,
)
results_lpm_wls = lpm_wls.fit()
# Compare OLS and WLS estimates
comparison_ols_wls = pd.DataFrame(
{
"Variable": results_lpm_ols.params.index,
"OLS_coef": round(results_lpm_ols.params, 6),
"OLS_se_robust": round(results_lpm_robust.bse, 6),
"WLS_coef": round(results_lpm_wls.params, 6),
"WLS_se": round(results_lpm_wls.bse, 6),
},
)
print("\nLPM: OLS vs WLS Estimation")
comparison_ols_wls
Interpretation:
WLS coefficients should be similar to OLS if the model is correctly specified
WLS standard errors may be smaller (efficiency gain) if variance model is correct
If OLS and WLS give very different coefficient estimates, this suggests model misspecification
8.4.4 Potential Problems with WLS for LPM¶
While WLS addresses the known heteroskedasticity in LPM, there are practical issues:
1. Predicted probabilities outside [0, 1]:
# Check range of fitted probabilities
print(f"Range of OLS fitted values: [{results_lpm_ols.fittedvalues.min():.4f}, {results_lpm_ols.fittedvalues.max():.4f}]")
print(
f"Observations with p_hat < 0: {(results_lpm_ols.fittedvalues < 0).sum()}",
)
print(
f"Observations with p_hat > 1: {(results_lpm_ols.fittedvalues > 1).sum()}",
)
# For these observations, h_hat = p_hat(1-p_hat) can be negative or problematic
# This violates the assumption that we know the form of heteroskedasticity.
Range of OLS fitted values: [-0.3451, 1.1272]
Observations with p_hat < 0: 16
Observations with p_hat > 1: 17
2. Variance model misspecification:
The assumed form $\mathrm{Var}(u \mid \mathbf{x}) = p(\mathbf{x})[1 - p(\mathbf{x})]$ is only correct if the LPM is the true model
But LPM is a linear approximation to inherently nonlinear probability models
If the true model is probit or logit, the variance formula is different
Using incorrect weights can lead to less efficient estimates than OLS
3. WLS standard errors may still be incorrect:
# Even WLS should use robust SEs as a precaution
results_lpm_wls_robust = lpm_wls.fit(cov_type="HC3")
comparison_wls_se = pd.DataFrame(
{
"Variable": results_lpm_wls.params.index,
"WLS_se_usual": round(results_lpm_wls.bse, 6),
"WLS_se_robust": round(results_lpm_wls_robust.bse, 6),
"Ratio": round(results_lpm_wls_robust.bse / results_lpm_wls.bse, 3),
},
)
print("\nWLS Standard Errors: Usual vs Robust")
comparison_wls_se
Interpretation:
If WLS standard errors (usual vs robust) differ substantially, the variance model is likely misspecified
Safest approach: Use WLS with robust SEs, or just use OLS with robust SEs
8.4.5 Practical Recommendations for LPM¶
Based on the analysis above, here are practical guidelines:
Simple and reliable approach:
Estimate LPM with OLS
Always use heteroskedasticity-robust standard errors (HC3)
Check fitted values - if many fall outside [0, 1], consider probit/logit (Chapter 17)
WLS approach (if seeking efficiency):
Estimate WLS using weights $1/[\hat{y}(1 - \hat{y})]$
Use robust standard errors for the WLS estimates
Compare OLS and WLS coefficients - if very different, model may be misspecified
Report both OLS-robust and WLS-robust results for transparency
When to avoid LPM altogether:
If most fitted probabilities fall near 0 or 1 (below 0.2 or above 0.8) or outside [0, 1] altogether
If the relationship is clearly nonlinear
For policy analysis requiring precise probability estimates
Use probit or logit models instead (Chapter 17)
# Summary comparison: Which method to use?
methods_comparison = pd.DataFrame(
{
"Method": [
"OLS (usual SE)",
"OLS (robust SE)",
"WLS (usual SE)",
"WLS (robust SE)",
],
"Valid Inference?": ["No (heteroskedasticity)", "Yes", "Maybe", "Yes"],
"Efficiency": ["No (not BLUE)", "No (not BLUE)", "Yes (if correct)", "Yes (if correct)"],
"Recommendation": [
"Never use",
"Standard practice",
"Use with caution",
"Best if WLS desired",
],
},
)
print("\nLPM Estimation Methods Comparison")
display(methods_comparison)
Key Takeaway:
The LPM has heteroskedasticity by construction with known form $\mathrm{Var}(u \mid \mathbf{x}) = p(\mathbf{x})[1 - p(\mathbf{x})]$. While WLS can exploit this structure for potential efficiency gains, practical problems (predictions outside [0,1], model misspecification) mean that OLS with robust standard errors remains the most reliable approach for LPM. For better modeling of binary outcomes, consider probit or logit models covered in Chapter 17.
Chapter Summary¶
Chapter 8 addressed heteroskedasticity - the violation of the constant error variance assumption (MLR.5) in linear regression. While heteroskedasticity does not bias OLS coefficient estimates, it invalidates standard inference procedures and reduces efficiency.
Key Concepts¶
Heteroskedasticity Definition:
Homoskedasticity (MLR.5): $\mathrm{Var}(u \mid \mathbf{x}) = \sigma^2$ for all $\mathbf{x}$ (constant variance)
Heteroskedasticity: $\mathrm{Var}(u_i \mid \mathbf{x}_i)$ varies across observations
Common in cross-sectional data, especially with:
Grouped data (firm-level, city-level aggregates)
Income or wealth variables (variance increases with level)
Binary dependent variables (LPM)
Count data
Consequences of Heteroskedasticity for OLS:
Unbiasedness and consistency preserved:
$E(\hat{\beta}_j) = \beta_j$ (unbiased under MLR.1-4)
$\mathrm{plim}\ \hat{\beta}_j = \beta_j$ (consistent)
Point estimates are still valid
Invalid standard errors and inference:
Usual OLS standard errors are biased and inconsistent
Can be either too small (over-rejection) or too large (under-rejection)
t-statistics, F-statistics, confidence intervals, p-values all invalid
Solution: Use heteroskedasticity-robust standard errors
Loss of efficiency (not BLUE):
OLS no longer has minimum variance among linear unbiased estimators
Weighted Least Squares (WLS) with correct weights is more efficient
Efficiency loss may be small in practice
Heteroskedasticity-Robust Inference:
White standard errors (HC): Asymptotically valid regardless of heteroskedasticity form
Variants: HC0 (original White), HC1, HC2, HC3 (finite-sample corrections)
HC3 recommended for general use (best small-sample properties)
In statsmodels:
model.fit(cov_type="HC3")
Applies to: t-tests, F-tests, confidence intervals
Practical guideline: Always report robust SEs in cross-sectional regressions
Testing for Heteroskedasticity:
Breusch-Pagan (BP) Test:
Tests $H_0\colon \mathrm{Var}(u \mid \mathbf{x}) = \sigma^2$ (homoskedasticity)
Regress squared OLS residuals on the independent variables: $\hat{u}^2 = \delta_0 + \delta_1 x_1 + \dots + \delta_k x_k + \text{error}$
Test statistic: $LM = n \cdot R^2_{\hat{u}^2}$, where $R^2_{\hat{u}^2}$ is the R-squared from this auxiliary regression
$LM \sim \chi^2_k$ under $H_0$ (approximately)
Rejects $H_0$ if the variance depends on any of the explanatory variables
White Test:
More general - includes squares and cross-products of regressors
Auxiliary regression: $\hat{u}^2$ on the regressors, their squares, and their cross-products (or, in the simplified version, on $\hat{y}$ and $\hat{y}^2$)
Test statistic: $LM = n \cdot R^2 \sim \chi^2_q$, where $q$ = number of regressors in the auxiliary regression
Detects heteroskedasticity of unknown form
Can also detect functional form misspecification
Interpretation:
Rejecting $H_0$ indicates heteroskedasticity is present
Failing to reject doesn’t prove homoskedasticity
With large $n$, tests may detect economically insignificant heteroskedasticity
Practical approach: Use robust SEs regardless of test results
Weighted Least Squares (WLS):
Idea: Give less weight to observations with higher variance
Transforms the model to satisfy homoskedasticity: divide $y_i$ and every regressor (including the constant) by $\sqrt{h_i}$, where $\mathrm{Var}(u_i \mid \mathbf{x}_i) = \sigma^2 h_i$
Equivalent to minimizing $\sum_i w_i (y_i - \beta_0 - \beta_1 x_{i1} - \dots - \beta_k x_{ik})^2$ with $w_i = 1/h_i$
If $h_i$ is known: WLS is BLUE (most efficient linear unbiased estimator)
Problem: $h_i$ is rarely known in practice
Feasible GLS (FGLS):
Step 1: Estimate OLS to get residuals $\hat{u}_i$
Step 2: Model the variance function: $\log(\hat{u}^2) = \delta_0 + \delta_1 z_1 + \dots + \delta_m z_m + e$
The $z_j$ variables can be the original $x_j$'s, squares, interactions
Step 3: Get fitted values $\hat{g}_i$ and compute $\hat{h}_i = \exp(\hat{g}_i)$
Step 4: Estimate WLS with weights $1/\hat{h}_i$
Result: FGLS estimates are consistent and potentially more efficient than OLS
FGLS Caveats:
Efficiency gain depends on correctly specifying variance function
A misspecified variance function $\hat{h}_i$ can give worse (less efficient) estimates than OLS
FGLS standard errors assume correct variance model
Best practice: Compute robust SEs even for FGLS estimates
If OLS and FGLS differ substantially, this suggests model misspecification
Linear Probability Model (LPM) and Heteroskedasticity:
LPM has heteroskedasticity by construction: $\mathrm{Var}(u \mid \mathbf{x}) = p(\mathbf{x})[1 - p(\mathbf{x})]$
where $p(\mathbf{x}) = P(y = 1 \mid \mathbf{x}) = \beta_0 + \beta_1 x_1 + \dots + \beta_k x_k$
Variance is a quadratic function of the predicted probability
Robust inference mandatory: Always use HC standard errors
WLS for LPM:
Known variance form allows FGLS with weights $1/[\hat{y}(1 - \hat{y})]$
But predictions outside [0,1] cause problems
Variance model only correct if LPM is true model
Practical recommendation: OLS with robust SEs is most reliable
Better alternatives: Probit or logit models (Chapter 17)
Python Implementation Patterns¶
Heteroskedasticity-robust standard errors:
Fit with robust SEs: results_robust = model.fit(cov_type="HC3") (HC3 recommended)
Access robust SEs: robust_se = results_robust.bse
Other options: cov_type="HC0", "HC1", "HC2"
F-tests with robust inference:
Define hypotheses: hypotheses = "x1 = 0, x2 = 0"
Robust F-test: f_test = results_robust.f_test(hypotheses)
Breusch-Pagan test:
Import: from statsmodels.stats.diagnostic import het_breuschpagan
Test: bp_test = het_breuschpagan(results.resid, results.model.exog)
White test:
Import: from statsmodels.stats.diagnostic import het_white
Test: white_test = het_white(results.resid, results.model.exog)
Weighted Least Squares:
With known weights: results_wls = smf.wls(formula="y ~ x1 + x2", weights=weights, data=df).fit()
FGLS procedure:
Estimate OLS: results_ols = smf.ols(...).fit()
Model the variance: regress log(resid**2) on the explanatory variables
Compute weights: weights_fgls = 1 / exp(fitted_values)
FGLS estimation: results_fgls = smf.wls(..., weights=weights_fgls).fit()
Robust SEs: use cov_type="HC3" in the WLS fit
Common Pitfalls¶
Using usual OLS SEs with heteroskedasticity:
Leads to invalid inference (wrong p-values, confidence intervals)
Fix: Always use robust SEs in cross-sectional data
Over-interpreting heteroskedasticity tests:
Rejection with large $n$ may indicate trivial heteroskedasticity
Non-rejection doesn’t guarantee homoskedasticity
Fix: Use robust SEs regardless of test results
Misspecifying variance function in FGLS:
Can result in less efficient estimates than OLS
Standard FGLS SEs invalid if model wrong
Fix: Use robust SEs even for FGLS; compare OLS and FGLS
Forgetting robust SEs for WLS:
WLS assumes variance model is correct
If wrong, usual WLS SEs invalid
Fix: Report WLS estimates with robust SEs
Using LPM without robust SEs:
LPM always has heteroskedasticity
Fix: Mandatory robust SEs for LPM
Pursuing efficiency at all costs:
Small efficiency gains may not justify complexity of FGLS
Risk of misspecification
Fix: OLS + robust SEs is simple, robust, and often sufficient
Decision Framework¶
When should you use each approach?
| Situation | Recommendation | Reasoning |
|---|---|---|
| Default for cross-sectional data | OLS + robust SEs (HC3) | Simple, valid, robust to misspecification |
| Heteroskedasticity confirmed, efficiency important | FGLS + robust SEs | Potential efficiency gains if variance modeled well |
| Known variance structure (rare) | WLS with true weights | Optimal (BLUE) |
| Linear probability model | OLS + robust SEs | Most reliable for LPM |
| Large differences OLS vs FGLS | Investigate model | Suggests misspecification |
| Time series data | Different methods | See Chapters 10-12 |
Connections¶
To previous chapters:
Chapter 3: Heteroskedasticity violates Gauss-Markov assumption MLR.5
Chapter 4: Invalidates usual t-tests and F-tests
Chapter 5: OLS still consistent, but not asymptotically efficient
Chapter 7: LPM has heteroskedasticity by construction
To later chapters:
Chapter 9: Heteroskedasticity tests can signal functional form problems
Chapter 12: Heteroskedasticity in time series contexts
Chapter 15: Robust SEs important for IV estimation
Chapter 17: Probit and logit as alternatives to LPM
Practical Guidance¶
Research workflow:
Estimate OLS with usual SEs (for comparison)
Compute robust SEs (HC3) - always report these
Test for heteroskedasticity (BP, White) - informative but not decisive
Consider FGLS if:
Efficiency is important (small sample)
You can plausibly model variance structure
You’ll use robust SEs for FGLS too
Compare OLS and FGLS:
Similar coefficients ⇒ heteroskedasticity is not severe
Very different coefficients ⇒ possible misspecification
Report both OLS-robust and FGLS-robust for transparency
What to report in papers:
Coefficient estimates (OLS or FGLS)
Robust standard errors in parentheses (specify HC0/HC1/HC3)
Note: “Heteroskedasticity-robust standard errors in parentheses”
Optionally: Results of BP or White test in footnote
If using FGLS: Describe variance model specification
Red flags:
OLS and FGLS coefficients differ by more than 1-2 standard errors
FGLS SEs much larger than OLS SEs (suggests misspecification)
Many LPM fitted values outside [0, 1] (use probit/logit instead)
Heteroskedasticity test rejects but robust SEs similar to usual SEs
Learning Objectives Covered¶
8.1: Understand consequences of heteroskedasticity for OLS (unbiased but invalid inference, not BLUE)
8.2: Explain homoskedasticity assumption and why it matters
8.3: Compute heteroskedasticity-robust standard errors (HC0, HC1, HC3)
8.4: Conduct valid t-tests and F-tests using robust SEs
8.5: Test for heteroskedasticity using Breusch-Pagan and White tests
8.6: Obtain and interpret p-values from heteroskedasticity tests
8.7: Explain when and why WLS can improve upon OLS (efficiency gains)
8.8: Estimate FGLS by modeling variance function
8.9: Interpret differences between OLS and WLS estimates (misspecification indicator)
8.10: Understand consequences of misspecifying variance in WLS (inefficiency, invalid SEs)
8.11: Apply WLS to LPM and understand its shortcomings (predictions outside [0,1], model uncertainty)
Key Takeaway¶
Heteroskedasticity is pervasive in cross-sectional economic data but easily addressed through robust inference. The most important practical lesson: Always use heteroskedasticity-robust standard errors (HC3) for cross-sectional regressions. This ensures valid inference regardless of whether heteroskedasticity is present. FGLS can improve efficiency but requires careful specification of the variance function and should always be accompanied by robust standard errors. For binary dependent variables, prefer probit or logit models over LPM when possible.