%pip install matplotlib numpy pandas statsmodels wooldridge -q
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import wooldridge as wool
import numpy as np
Example 7.1: Hourly Wage Equation
wage1 = wool.data("wage1")
reg = smf.ols(formula="wage ~ female + educ + exper + tenure", data=wage1)
results = reg.fit()
# print regression table:
table = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table: \n{table}\n")
table:
b se t pval
Intercept -1.5679 0.7246 -2.1640 0.0309
female -1.8109 0.2648 -6.8379 0.0000
educ 0.5715 0.0493 11.5836 0.0000
exper 0.0254 0.0116 2.1951 0.0286
tenure 0.1410 0.0212 6.6632 0.0000
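The coefficient on female implies an average wage gap of about $1.81 per hour for women, holding educ, exper, and tenure fixed. As a small supplementary check (not part of the original example), a 95% confidence interval for this gap can be pulled from the fitted results:
# 95% confidence interval for the female coefficient (uses `results` from above):
ci = results.conf_int(alpha=0.05)
print(ci.loc["female"])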
Example 7.6: Log Hourly Wage Equation
wage1 = wool.data("wage1")
reg = smf.ols(
    formula="np.log(wage) ~ married*female + educ + exper +"
    "I(exper**2) + tenure + I(tenure**2)",
    data=wage1,
)
results = reg.fit()
# print regression table:
table = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table: \n{table}\n")
table:
b se t pval
Intercept 0.3214 0.1000 3.2135 0.0014
married 0.2127 0.0554 3.8419 0.0001
female -0.1104 0.0557 -1.9797 0.0483
married:female -0.3006 0.0718 -4.1885 0.0000
educ 0.0789 0.0067 11.7873 0.0000
exper 0.0268 0.0052 5.1118 0.0000
I(exper ** 2) -0.0005 0.0001 -4.8471 0.0000
tenure 0.0291 0.0068 4.3016 0.0000
I(tenure ** 2) -0.0005 0.0002 -2.3056 0.0215
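With the interaction included, the log-wage differential of married women relative to the base group (single men) is the sum of the three dummy and interaction coefficients, roughly 0.2127 - 0.1104 - 0.3006 = -0.198. A short sketch of that arithmetic, reusing the fitted results above:
# log-wage differential: married women vs. single men (sum of dummy/interaction terms):
b = results.params
diff_marr_fem = b["married"] + b["female"] + b["married:female"]
print(f"married female vs. single male: {diff_marr_fem:.4f}")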
7.2 Boolean Variables
wage1 = wool.data("wage1")
# regression with boolean variable:
wage1["isfemale"] = wage1["female"] == 1
reg = smf.ols(formula="wage ~ isfemale + educ + exper + tenure", data=wage1)
results = reg.fit()
# print regression table:
table = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table: \n{table}\n")
table:
b se t pval
Intercept -1.5679 0.7246 -2.1640 0.0309
isfemale[T.True] -1.8109 0.2648 -6.8379 0.0000
educ 0.5715 0.0493 11.5836 0.0000
exper 0.0254 0.0116 2.1951 0.0286
tenure 0.1410 0.0212 6.6632 0.0000
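The estimates are identical to those from the 0/1 dummy: patsy treats the boolean column as categorical with False as the reference level, which is why the coefficient is labelled isfemale[T.True]. A quick check (re-running the original specification so both fits are available):
# sanity check: the boolean regressor reproduces the 0/1 dummy estimates exactly:
results_01 = smf.ols(formula="wage ~ female + educ + exper + tenure", data=wage1).fit()
print(np.allclose(results.params.values, results_01.params.values))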
7.3 Categorical Variables
CPS1985 = pd.read_csv("../data/CPS1985.csv")
# copy the variable under a shorter name to make outputs more compact:
CPS1985["oc"] = CPS1985["occupation"]
# table of categories and frequencies for two categorical variables:
freq_gender = pd.crosstab(CPS1985["gender"], columns="count")
print(f"freq_gender: \n{freq_gender}\n")
freq_occupation = pd.crosstab(CPS1985["oc"], columns="count")
print(f"freq_occupation: \n{freq_occupation}\n")
freq_gender:
col_0 count
gender
female 245
male 289
freq_occupation:
col_0 count
oc
management 55
office 97
sales 38
services 83
technical 105
worker 156
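The same pd.crosstab call can also tabulate the two variables jointly, which shows how the occupations split by gender (a small optional extension of the code above):
# joint frequency table of gender by occupation:
freq_joint = pd.crosstab(CPS1985["gender"], CPS1985["oc"])
print(f"freq_joint: \n{freq_joint}\n")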
# directly using categorical variables in regression formula:
reg = smf.ols(
    formula="np.log(wage) ~ education + experience + C(gender) + C(oc)",
    data=CPS1985,
)
results = reg.fit()
# print regression table:
table = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table: \n{table}\n")
table:
b se t pval
Intercept 0.9050 0.1717 5.2718 0.0000
C(gender)[T.male] 0.2238 0.0423 5.2979 0.0000
C(oc)[T.office] -0.2073 0.0776 -2.6699 0.0078
C(oc)[T.sales] -0.3601 0.0936 -3.8455 0.0001
C(oc)[T.services] -0.3626 0.0818 -4.4305 0.0000
C(oc)[T.technical] -0.0101 0.0740 -0.1363 0.8916
C(oc)[T.worker] -0.1525 0.0763 -1.9981 0.0462
education 0.0759 0.0101 7.5449 0.0000
experience 0.0119 0.0017 7.0895 0.0000
# rerun regression with different reference category:
reg_newref = smf.ols(
    formula="np.log(wage) ~ education + experience + "
    'C(gender, Treatment("male")) + '
    'C(oc, Treatment("technical"))',
    data=CPS1985,
)
results_newref = reg_newref.fit()
# print results:
table_newref = pd.DataFrame(
    {
        "b": round(results_newref.params, 4),
        "se": round(results_newref.bse, 4),
        "t": round(results_newref.tvalues, 4),
        "pval": round(results_newref.pvalues, 4),
    },
)
print(f"table_newref: \n{table_newref}\n")
table_newref:
b se t pval
Intercept 1.1187 0.1765 6.3393 0.0000
C(gender, Treatment("male"))[T.female] -0.2238 0.0423 -5.2979 0.0000
C(oc, Treatment("technical"))[T.management] 0.0101 0.0740 0.1363 0.8916
C(oc, Treatment("technical"))[T.office] -0.1972 0.0678 -2.9082 0.0038
C(oc, Treatment("technical"))[T.sales] -0.3500 0.0863 -4.0541 0.0001
C(oc, Treatment("technical"))[T.services] -0.3525 0.0750 -4.7030 0.0000
C(oc, Treatment("technical"))[T.worker] -0.1425 0.0705 -2.0218 0.0437
education 0.0759 0.0101 7.5449 0.0000
experience 0.0119 0.0017 7.0895 0.0000
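Changing the reference category only re-parameterizes the model; the fit itself is unchanged. A quick check, comparing fitted values from the two specifications above:
# fitted values do not depend on the choice of reference category:
print(np.allclose(results.fittedvalues, results_newref.fittedvalues))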
7.3.1 ANOVA Tables
CPS1985 = pd.read_csv("../data/CPS1985.csv")
# run regression:
reg = smf.ols(
    formula="np.log(wage) ~ education + experience + gender + occupation",
    data=CPS1985,
)
results = reg.fit()
# print regression table:
table_reg = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table_reg: \n{table_reg}\n")
table_reg:
b se t pval
Intercept 0.9050 0.1717 5.2718 0.0000
gender[T.male] 0.2238 0.0423 5.2979 0.0000
occupation[T.office] -0.2073 0.0776 -2.6699 0.0078
occupation[T.sales] -0.3601 0.0936 -3.8455 0.0001
occupation[T.services] -0.3626 0.0818 -4.4305 0.0000
occupation[T.technical] -0.0101 0.0740 -0.1363 0.8916
occupation[T.worker] -0.1525 0.0763 -1.9981 0.0462
education 0.0759 0.0101 7.5449 0.0000
experience 0.0119 0.0017 7.0895 0.0000
# ANOVA table:
table_anova = sm.stats.anova_lm(results, typ=2)
print(f"table_anova: \n{table_anova}\n")
table_anova:
sum_sq df F PR(>F)
gender 5.414018 1.0 28.067296 1.727015e-07
occupation 7.152529 5.0 7.416013 9.805485e-07
education 10.980589 1.0 56.925450 2.010374e-13
experience 9.695055 1.0 50.261001 4.365391e-12
Residual 101.269451 525.0 NaN NaN
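Each row of this type-II ANOVA table is a joint F test that all coefficients belonging to that term are zero. For example, the occupation row can be reproduced with results.f_test on the five occupation dummies (coefficient names as printed in table_reg above):
# joint F test on the occupation dummies (matches the ANOVA row for occupation):
hypotheses = [
    "occupation[T.office] = 0",
    "occupation[T.sales] = 0",
    "occupation[T.services] = 0",
    "occupation[T.technical] = 0",
    "occupation[T.worker] = 0",
]
print(results.f_test(hypotheses))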
7.4 Breaking a Numeric Variable Into Categories
Example 7.8: Effects of Law School Rankings on Starting Salaries
lawsch85 = wool.data("lawsch85")
# define cut points for the rank:
cutpts = [0, 10, 25, 40, 60, 100, 175]
# create categorical variable containing ranges for the rank:
lawsch85["rc"] = pd.cut(
lawsch85["rank"],
bins=cutpts,
labels=["(0,10]", "(10,25]", "(25,40]", "(40,60]", "(60,100]", "(100,175]"],
)
# display frequencies:
freq = pd.crosstab(lawsch85["rc"], columns="count")
print(f"freq: \n{freq}\n")
freq:
col_0 count
rc
(0,10] 10
(10,25] 16
(25,40] 13
(40,60] 18
(60,100] 37
(100,175] 62
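By default pd.cut builds right-closed bins (right=True), which is why the labels read "(a,b]": a school ranked exactly 10 falls into the first group and rank 11 into the second. A tiny illustration of that convention:
# pd.cut uses right-closed intervals by default:
print(pd.cut(pd.Series([10, 11]), bins=cutpts).astype(str).tolist())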
# run regression:
reg = smf.ols(
    formula='np.log(salary) ~ C(rc, Treatment("(100,175]")) +'
    "LSAT + GPA + np.log(libvol) + np.log(cost)",
    data=lawsch85,
)
results = reg.fit()
# print regression table:
table_reg = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table_reg: \n{table_reg}\n")
table_reg:
b se t pval
Intercept 9.1653 0.4114 22.2770 0.0000
C(rc, Treatment("(100,175]"))[T.(0,10]] 0.6996 0.0535 13.0780 0.0000
C(rc, Treatment("(100,175]"))[T.(10,25]] 0.5935 0.0394 15.0493 0.0000
C(rc, Treatment("(100,175]"))[T.(25,40]] 0.3751 0.0341 11.0054 0.0000
C(rc, Treatment("(100,175]"))[T.(40,60]] 0.2628 0.0280 9.3991 0.0000
C(rc, Treatment("(100,175]"))[T.(60,100]] 0.1316 0.0210 6.2540 0.0000
LSAT 0.0057 0.0031 1.8579 0.0655
GPA 0.0137 0.0742 0.1850 0.8535
np.log(libvol) 0.0364 0.0260 1.3976 0.1647
np.log(cost) 0.0008 0.0251 0.0335 0.9734
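Because the dependent variable is log(salary), the coefficient on the top-10 group translates into an exact percentage premium over the reference group (ranks 100-175) via exp(b) - 1, here roughly 101%. A short sketch using the fitted results above:
# exact percentage salary premium of a top-10 school over the base group:
b_top10 = results.params['C(rc, Treatment("(100,175]"))[T.(0,10]]']
print(f"premium: {100 * (np.exp(b_top10) - 1):.1f}%")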
# ANOVA table:
table_anova = sm.stats.anova_lm(results, typ=2)
print(f"table_anova: \n{table_anova}\n")
table_anova:
sum_sq df F PR(>F)
C(rc, Treatment("(100,175]")) 1.868867 5.0 50.962988 1.174406e-28
LSAT 0.025317 1.0 3.451900 6.551320e-02
GPA 0.000251 1.0 0.034225 8.535262e-01
np.log(libvol) 0.014327 1.0 1.953419 1.646748e-01
np.log(cost) 0.000008 1.0 0.001120 9.733564e-01
Residual 0.924111 126.0 NaN NaN
7.5 Interactions and Differences in Regression Functions Across Groups
gpa3 = wool.data("gpa3")
# model with full interactions with female dummy (only for spring data):
reg = smf.ols(
    formula="cumgpa ~ female * (sat + hsperc + tothrs)",
    data=gpa3,
    subset=(gpa3["spring"] == 1),
)
results = reg.fit()
# print regression table:
table = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table: \n{table}\n")
table:
b se t pval
Intercept 1.4808 0.2073 7.1422 0.0000
female -0.3535 0.4105 -0.8610 0.3898
sat 0.0011 0.0002 5.8073 0.0000
hsperc -0.0085 0.0014 -6.1674 0.0000
tothrs 0.0023 0.0009 2.7182 0.0069
female:sat 0.0008 0.0004 1.9488 0.0521
female:hsperc -0.0005 0.0032 -0.1739 0.8621
female:tothrs -0.0001 0.0016 -0.0712 0.9433
# F test of H0: the female dummy and all its interaction terms are zero:
hypotheses = ["female = 0", "female:sat = 0", "female:hsperc = 0", "female:tothrs = 0"]
ftest = results.f_test(hypotheses)
fstat = ftest.statistic
fpval = ftest.pvalue
print(f"fstat: {fstat}\n")
print(f"fpval: {fpval}\n")
fstat: 8.179111637044619
fpval: 2.544637191829608e-06
gpa3 = wool.data("gpa3")
# estimate model for males (& spring data):
reg_m = smf.ols(
    formula="cumgpa ~ sat + hsperc + tothrs",
    data=gpa3,
    subset=(gpa3["spring"] == 1) & (gpa3["female"] == 0),
)
results_m = reg_m.fit()
# print regression table:
table_m = pd.DataFrame(
    {
        "b": round(results_m.params, 4),
        "se": round(results_m.bse, 4),
        "t": round(results_m.tvalues, 4),
        "pval": round(results_m.pvalues, 4),
    },
)
print(f"table_m: \n{table_m}\n")
table_m:
b se t pval
Intercept 1.4808 0.2060 7.1894 0.0000
sat 0.0011 0.0002 5.8458 0.0000
hsperc -0.0085 0.0014 -6.2082 0.0000
tothrs 0.0023 0.0009 2.7362 0.0066
# estimate model for females (& spring data):
reg_f = smf.ols(
    formula="cumgpa ~ sat + hsperc + tothrs",
    data=gpa3,
    subset=(gpa3["spring"] == 1) & (gpa3["female"] == 1),
)
results_f = reg_f.fit()
# print regression table:
table_f = pd.DataFrame(
    {
        "b": round(results_f.params, 4),
        "se": round(results_f.bse, 4),
        "t": round(results_f.tvalues, 4),
        "pval": round(results_f.pvalues, 4),
    },
)
print(f"table_f: \n{table_f}\n")
table_f:
b se t pval
Intercept 1.1273 0.3616 3.1176 0.0025
sat 0.0018 0.0003 5.1950 0.0000
hsperc -0.0090 0.0029 -3.0956 0.0027
tothrs 0.0022 0.0014 1.5817 0.1174
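The separate male and female regressions tie back to the joint F test above: the classic Chow test compares the pooled model (no female dummy, no interactions) against the sum of the two group-specific SSRs and reproduces the F statistic of about 8.18. A sketch of that computation, reusing results_m and results_f from the fits above:
# Chow test across groups, built from residual sums of squares:
reg_p = smf.ols(
    formula="cumgpa ~ sat + hsperc + tothrs",
    data=gpa3,
    subset=(gpa3["spring"] == 1),
)
results_p = reg_p.fit()
ssr_p = results_p.ssr  # restricted (pooled) SSR
ssr_ur = results_m.ssr + results_f.ssr  # unrestricted SSR (both groups)
k = 4  # parameters per group, including the intercept
n = results_p.nobs
fstat_chow = (ssr_p - ssr_ur) / ssr_ur * (n - 2 * k) / k
print(f"fstat_chow: {fstat_chow}")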