%pip install matplotlib numpy pandas statsmodels wooldridge -q
Note: you may need to restart the kernel to use updated packages.
import numpy as np # noqa
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import wooldridge as wool
# load the wage1 data set and regress wage on a gender dummy plus
# human-capital controls (educ, exper, tenure):
wage1 = wool.data("wage1")
reg = smf.ols(formula="wage ~ female + educ + exper + tenure", data=wage1)
results = reg.fit()

# coefficient table: estimates, standard errors, t-statistics and
# p-values, rounded to 4 decimals for display:
table = pd.DataFrame(
    {
        "b": results.params,
        "se": results.bse,
        "t": results.tvalues,
        "pval": results.pvalues,
    },
).round(4)
print(f"table: \n{table}\n")
table:
b se t pval
Intercept -1.5679 0.7246 -2.1640 0.0309
female -1.8109 0.2648 -6.8379 0.0000
educ 0.5715 0.0493 11.5836 0.0000
exper 0.0254 0.0116 2.1951 0.0286
tenure 0.1410 0.0212 6.6632 0.0000
Example 7.6: Log Hourly Wage Equation¶
# Example 7.6: log wage equation with a married*female interaction and
# quadratic terms in experience and tenure:
wage1 = wool.data("wage1")
reg = smf.ols(
    formula=(
        "np.log(wage) ~ married*female + educ + exper +"
        "I(exper**2) + tenure + I(tenure**2)"
    ),
    data=wage1,
)
results = reg.fit()

# coefficient table, rounded to 4 decimals for display:
table = pd.DataFrame(
    {
        "b": results.params,
        "se": results.bse,
        "t": results.tvalues,
        "pval": results.pvalues,
    },
).round(4)
print(f"table: \n{table}\n")
table:
b se t pval
Intercept 0.3214 0.1000 3.2135 0.0014
married 0.2127 0.0554 3.8419 0.0001
female -0.1104 0.0557 -1.9797 0.0483
married:female -0.3006 0.0718 -4.1885 0.0000
educ 0.0789 0.0067 11.7873 0.0000
exper 0.0268 0.0052 5.1118 0.0000
I(exper ** 2) -0.0005 0.0001 -4.8471 0.0000
tenure 0.0291 0.0068 4.3016 0.0000
I(tenure ** 2) -0.0005 0.0002 -2.3056 0.0215
7.2 Boolean variables¶
wage1 = wool.data("wage1")

# build a boolean regressor from the 0/1 female column; the fitted model
# reports it as a dummy (isfemale[T.True] in the output):
wage1["isfemale"] = wage1["female"].eq(1)
reg = smf.ols(formula="wage ~ isfemale + educ + exper + tenure", data=wage1)
results = reg.fit()

# coefficient table, rounded to 4 decimals for display:
table = pd.DataFrame(
    {
        "b": results.params,
        "se": results.bse,
        "t": results.tvalues,
        "pval": results.pvalues,
    },
).round(4)
print(f"table: \n{table}\n")
table:
b se t pval
Intercept -1.5679 0.7246 -2.1640 0.0309
isfemale[T.True] -1.8109 0.2648 -6.8379 0.0000
educ 0.5715 0.0493 11.5836 0.0000
exper 0.0254 0.0116 2.1951 0.0286
tenure 0.1410 0.0212 6.6632 0.0000
7.3 Categorical Variables¶
CPS1985 = pd.read_csv("../data/CPS1985.csv")

# alias the occupation column as "oc" so later regression output stays compact:
CPS1985["oc"] = CPS1985["occupation"]

# one-way frequency table for gender:
freq_gender = pd.crosstab(CPS1985["gender"], columns="count")
print(f"freq_gender: \n{freq_gender}\n")

# one-way frequency table for occupation (via the alias):
freq_occupation = pd.crosstab(CPS1985["oc"], columns="count")
print(f"freq_occupation: \n{freq_occupation}\n")
freq_gender:
col_0 count
gender
female 245
male 289
freq_occupation:
col_0 count
oc
management 55
office 97
sales 38
services 83
technical 105
worker 156
# categorical string columns can be used directly in the formula by
# wrapping them in C(); each gets dummy-encoded against a reference level:
reg = smf.ols(
    formula="np.log(wage) ~ education +experience + C(gender) + C(oc)",
    data=CPS1985,
)
results = reg.fit()

# coefficient table, rounded to 4 decimals for display:
table = pd.DataFrame(
    {
        "b": results.params,
        "se": results.bse,
        "t": results.tvalues,
        "pval": results.pvalues,
    },
).round(4)
print(f"table: \n{table}\n")
table:
b se t pval
Intercept 0.9050 0.1717 5.2718 0.0000
C(gender)[T.male] 0.2238 0.0423 5.2979 0.0000
C(oc)[T.office] -0.2073 0.0776 -2.6699 0.0078
C(oc)[T.sales] -0.3601 0.0936 -3.8455 0.0001
C(oc)[T.services] -0.3626 0.0818 -4.4305 0.0000
C(oc)[T.technical] -0.0101 0.0740 -0.1363 0.8916
C(oc)[T.worker] -0.1525 0.0763 -1.9981 0.0462
education 0.0759 0.0101 7.5449 0.0000
experience 0.0119 0.0017 7.0895 0.0000
# same model, but pick the reference categories explicitly with Treatment():
# "male" for gender and "technical" for occupation.
reg_newref = smf.ols(
    formula=(
        "np.log(wage) ~ education + experience + "
        'C(gender, Treatment("male")) + '
        'C(oc, Treatment("technical"))'
    ),
    data=CPS1985,
)
results_newref = reg_newref.fit()

# coefficient table, rounded to 4 decimals for display:
table_newref = pd.DataFrame(
    {
        "b": results_newref.params,
        "se": results_newref.bse,
        "t": results_newref.tvalues,
        "pval": results_newref.pvalues,
    },
).round(4)
print(f"table_newref: \n{table_newref}\n")
table_newref:
b se t pval
Intercept 1.1187 0.1765 6.3393 0.0000
C(gender, Treatment("male"))[T.female] -0.2238 0.0423 -5.2979 0.0000
C(oc, Treatment("technical"))[T.management] 0.0101 0.0740 0.1363 0.8916
C(oc, Treatment("technical"))[T.office] -0.1972 0.0678 -2.9082 0.0038
C(oc, Treatment("technical"))[T.sales] -0.3500 0.0863 -4.0541 0.0001
C(oc, Treatment("technical"))[T.services] -0.3525 0.0750 -4.7030 0.0000
C(oc, Treatment("technical"))[T.worker] -0.1425 0.0705 -2.0218 0.0437
education 0.0759 0.0101 7.5449 0.0000
experience 0.0119 0.0017 7.0895 0.0000
7.3.1 ANOVA Tables¶
CPS1985 = pd.read_csv("../data/CPS1985.csv")

# regression with the raw string columns; they are dummy-encoded
# automatically (see gender[T.male], occupation[T.*] in the output):
reg = smf.ols(
    formula="np.log(wage) ~ education + experience + gender + occupation",
    data=CPS1985,
)
results = reg.fit()

# coefficient table, rounded to 4 decimals for display:
table_reg = pd.DataFrame(
    {
        "b": results.params,
        "se": results.bse,
        "t": results.tvalues,
        "pval": results.pvalues,
    },
).round(4)
print(f"table_reg: \n{table_reg}\n")
table_reg:
b se t pval
Intercept 0.9050 0.1717 5.2718 0.0000
gender[T.male] 0.2238 0.0423 5.2979 0.0000
occupation[T.office] -0.2073 0.0776 -2.6699 0.0078
occupation[T.sales] -0.3601 0.0936 -3.8455 0.0001
occupation[T.services] -0.3626 0.0818 -4.4305 0.0000
occupation[T.technical] -0.0101 0.0740 -0.1363 0.8916
occupation[T.worker] -0.1525 0.0763 -1.9981 0.0462
education 0.0759 0.0101 7.5449 0.0000
experience 0.0119 0.0017 7.0895 0.0000
# ANOVA table:
# typ=2 requests Type II sums of squares, which yields one joint F-test
# per regressor group (e.g. all occupation dummies tested together).
table_anova = sm.stats.anova_lm(results, typ=2)
print(f"table_anova: \n{table_anova}\n")
table_anova:
sum_sq df F PR(>F)
gender 5.414018 1.0 28.067296 1.727015e-07
occupation 7.152529 5.0 7.416013 9.805485e-07
education 10.980589 1.0 56.925450 2.010374e-13
experience 9.695055 1.0 50.261001 4.365391e-12
Residual 101.269451 525.0 NaN NaN
7.4 Breaking a Numeric Variable Into Categories¶
Example 7.8: Effects of Law School Rankings on Starting Salaries¶
lawsch85 = wool.data("lawsch85")

# cut points for binning the rank variable:
cutpts = [0, 10, 25, 40, 60, 100, 175]

# bin rank into half-open intervals; the labels are generated from the
# cut points so they always agree with the bins:
lawsch85["rc"] = pd.cut(
    lawsch85["rank"],
    bins=cutpts,
    labels=[f"({lo},{hi}]" for lo, hi in zip(cutpts[:-1], cutpts[1:])],
)

# frequency table of the new categorical variable:
freq = pd.crosstab(lawsch85["rc"], columns="count")
print(f"freq: \n{freq}\n")
freq:
col_0 count
rc
(0,10] 10
(10,25] 16
(25,40] 13
(40,60] 18
(60,100] 37
(100,175] 62
# regression of log salary on the rank categories (reference category
# "(100,175]") plus admissions and cost controls:
reg = smf.ols(
    formula=(
        'np.log(salary) ~ C(rc, Treatment("(100,175]")) +'
        "LSAT + GPA + np.log(libvol) + np.log(cost)"
    ),
    data=lawsch85,
)
results = reg.fit()

# coefficient table, rounded to 4 decimals for display:
table_reg = pd.DataFrame(
    {
        "b": results.params,
        "se": results.bse,
        "t": results.tvalues,
        "pval": results.pvalues,
    },
).round(4)
print(f"table_reg: \n{table_reg}\n")
table_reg:
b se t pval
Intercept 9.1653 0.4114 22.2770 0.0000
C(rc, Treatment("(100,175]"))[T.(0,10]] 0.6996 0.0535 13.0780 0.0000
C(rc, Treatment("(100,175]"))[T.(10,25]] 0.5935 0.0394 15.0493 0.0000
C(rc, Treatment("(100,175]"))[T.(25,40]] 0.3751 0.0341 11.0054 0.0000
C(rc, Treatment("(100,175]"))[T.(40,60]] 0.2628 0.0280 9.3991 0.0000
C(rc, Treatment("(100,175]"))[T.(60,100]] 0.1316 0.0210 6.2540 0.0000
LSAT 0.0057 0.0031 1.8579 0.0655
GPA 0.0137 0.0742 0.1850 0.8535
np.log(libvol) 0.0364 0.0260 1.3976 0.1647
np.log(cost) 0.0008 0.0251 0.0335 0.9734
# ANOVA table:
# typ=2 (Type II sums of squares) tests all rank-range dummies jointly
# as a single group (one row with df = 5 in the output).
table_anova = sm.stats.anova_lm(results, typ=2)
print(f"table_anova: \n{table_anova}\n")
table_anova:
sum_sq df F PR(>F)
C(rc, Treatment("(100,175]")) 1.868867 5.0 50.962988 1.174406e-28
LSAT 0.025317 1.0 3.451900 6.551320e-02
GPA 0.000251 1.0 0.034225 8.535262e-01
np.log(libvol) 0.014327 1.0 1.953419 1.646748e-01
np.log(cost) 0.000008 1.0 0.001120 9.733564e-01
Residual 0.924111 126.0 NaN NaN
7.5 Interactions and Differences in Regression Functions Across Groups¶
gpa3 = wool.data("gpa3")

# interact the female dummy with every regressor, restricted to the
# spring-semester observations:
reg = smf.ols(
    formula="cumgpa ~ female * (sat + hsperc + tothrs)",
    data=gpa3,
    subset=(gpa3["spring"] == 1),
)
results = reg.fit()

# coefficient table, rounded to 4 decimals for display:
table = pd.DataFrame(
    {
        "b": results.params,
        "se": results.bse,
        "t": results.tvalues,
        "pval": results.pvalues,
    },
).round(4)
print(f"table: \n{table}\n")

# F-test of H0: the female intercept shift and all female interaction
# coefficients are zero (same regression function for both groups):
hypotheses = ["female = 0"] + [
    f"female:{regressor} = 0" for regressor in ("sat", "hsperc", "tothrs")
]
ftest = results.f_test(hypotheses)
fstat = ftest.statistic
fpval = ftest.pvalue
print(f"fstat: {fstat}\n")
print(f"fpval: {fpval}\n")
fstat: 8.179111637044619
fpval: 2.544637191829608e-06
gpa3 = wool.data("gpa3")

# separate regression on the male subsample (spring semester only):
reg_m = smf.ols(
    formula="cumgpa ~ sat + hsperc + tothrs",
    data=gpa3,
    subset=(gpa3["spring"] == 1) & (gpa3["female"] == 0),
)
results_m = reg_m.fit()

# coefficient table, rounded to 4 decimals for display:
table_m = pd.DataFrame(
    {
        "b": results_m.params,
        "se": results_m.bse,
        "t": results_m.tvalues,
        "pval": results_m.pvalues,
    },
).round(4)
print(f"table_m: \n{table_m}\n")
table_m:
b se t pval
Intercept 1.4808 0.2060 7.1894 0.0000
sat 0.0011 0.0002 5.8458 0.0000
hsperc -0.0085 0.0014 -6.2082 0.0000
tothrs 0.0023 0.0009 2.7362 0.0066
# separate regression on the female subsample (spring semester only):
reg_f = smf.ols(
    formula="cumgpa ~ sat + hsperc + tothrs",
    data=gpa3,
    subset=(gpa3["spring"] == 1) & (gpa3["female"] == 1),
)
results_f = reg_f.fit()

# coefficient table, rounded to 4 decimals for display:
table_f = pd.DataFrame(
    {
        "b": results_f.params,
        "se": results_f.bse,
        "t": results_f.tvalues,
        "pval": results_f.pvalues,
    },
).round(4)
print(f"table_f: \n{table_f}\n")
table_f:
b se t pval
Intercept 1.1273 0.3616 3.1176 0.0025
sat 0.0018 0.0003 5.1950 0.0000
hsperc -0.0090 0.0029 -3.0956 0.0027
tothrs 0.0022 0.0014 1.5817 0.1174