Skip to article frontmatter | Skip to article content

7. Multiple Regression Analysis with Qualitative Regressors

%pip install matplotlib numpy pandas statsmodels wooldridge -q
Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import wooldridge as wool
import numpy as np

7.1 Linear Regression with Dummy Variables as Regressors

Example 7.1: Hourly Wage Equation

wage1 = wool.data("wage1")

reg = smf.ols(formula="wage ~ female + educ + exper + tenure", data=wage1)
results = reg.fit()

# print regression table:
table = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table: \n{table}\n")
table: 
                b      se        t    pval
Intercept -1.5679  0.7246  -2.1640  0.0309
female    -1.8109  0.2648  -6.8379  0.0000
educ       0.5715  0.0493  11.5836  0.0000
exper      0.0254  0.0116   2.1951  0.0286
tenure     0.1410  0.0212   6.6632  0.0000

Example 7.6: Log Hourly Wage Equation

wage1 = wool.data("wage1")

reg = smf.ols(
    formula="np.log(wage) ~ married*female + educ + exper +"
    "I(exper**2) + tenure + I(tenure**2)",
    data=wage1,
)
results = reg.fit()

# print regression table:
table = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table: \n{table}\n")
table: 
                     b      se        t    pval
Intercept       0.3214  0.1000   3.2135  0.0014
married         0.2127  0.0554   3.8419  0.0001
female         -0.1104  0.0557  -1.9797  0.0483
married:female -0.3006  0.0718  -4.1885  0.0000
educ            0.0789  0.0067  11.7873  0.0000
exper           0.0268  0.0052   5.1118  0.0000
I(exper ** 2)  -0.0005  0.0001  -4.8471  0.0000
tenure          0.0291  0.0068   4.3016  0.0000
I(tenure ** 2) -0.0005  0.0002  -2.3056  0.0215

7.2 Boolean Variables

wage1 = wool.data("wage1")

# regression with boolean variable:
wage1["isfemale"] = wage1["female"] == 1
reg = smf.ols(formula="wage ~ isfemale + educ + exper + tenure", data=wage1)
results = reg.fit()

# print regression table:
table = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table: \n{table}\n")
table: 
                       b      se        t    pval
Intercept        -1.5679  0.7246  -2.1640  0.0309
isfemale[T.True] -1.8109  0.2648  -6.8379  0.0000
educ              0.5715  0.0493  11.5836  0.0000
exper             0.0254  0.0116   2.1951  0.0286
tenure            0.1410  0.0212   6.6632  0.0000

7.3 Categorical Variables

CPS1985 = pd.read_csv("../data/CPS1985.csv")
# rename variable to make outputs more compact:
CPS1985["oc"] = CPS1985["occupation"]

# table of categories and frequencies for two categorical variables:
freq_gender = pd.crosstab(CPS1985["gender"], columns="count")
print(f"freq_gender: \n{freq_gender}\n")

freq_occupation = pd.crosstab(CPS1985["oc"], columns="count")
print(f"freq_occupation: \n{freq_occupation}\n")
freq_gender: 
col_0   count
gender       
female    245
male      289

freq_occupation: 
col_0       count
oc               
management     55
office         97
sales          38
services       83
technical     105
worker        156

# directly using categorical variables in regression formula:
reg = smf.ols(
    formula="np.log(wage) ~ education +experience + C(gender) + C(oc)",
    data=CPS1985,
)
results = reg.fit()

# print regression table:
table = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table: \n{table}\n")
table: 
                         b      se       t    pval
Intercept           0.9050  0.1717  5.2718  0.0000
C(gender)[T.male]   0.2238  0.0423  5.2979  0.0000
C(oc)[T.office]    -0.2073  0.0776 -2.6699  0.0078
C(oc)[T.sales]     -0.3601  0.0936 -3.8455  0.0001
C(oc)[T.services]  -0.3626  0.0818 -4.4305  0.0000
C(oc)[T.technical] -0.0101  0.0740 -0.1363  0.8916
C(oc)[T.worker]    -0.1525  0.0763 -1.9981  0.0462
education           0.0759  0.0101  7.5449  0.0000
experience          0.0119  0.0017  7.0895  0.0000

# rerun regression with different reference category:
reg_newref = smf.ols(
    formula="np.log(wage) ~ education + experience + "
    'C(gender, Treatment("male")) + '
    'C(oc, Treatment("technical"))',
    data=CPS1985,
)
results_newref = reg_newref.fit()

# print results:
table_newref = pd.DataFrame(
    {
        "b": round(results_newref.params, 4),
        "se": round(results_newref.bse, 4),
        "t": round(results_newref.tvalues, 4),
        "pval": round(results_newref.pvalues, 4),
    },
)
print(f"table_newref: \n{table_newref}\n")
table_newref: 
                                                  b      se       t    pval
Intercept                                    1.1187  0.1765  6.3393  0.0000
C(gender, Treatment("male"))[T.female]      -0.2238  0.0423 -5.2979  0.0000
C(oc, Treatment("technical"))[T.management]  0.0101  0.0740  0.1363  0.8916
C(oc, Treatment("technical"))[T.office]     -0.1972  0.0678 -2.9082  0.0038
C(oc, Treatment("technical"))[T.sales]      -0.3500  0.0863 -4.0541  0.0001
C(oc, Treatment("technical"))[T.services]   -0.3525  0.0750 -4.7030  0.0000
C(oc, Treatment("technical"))[T.worker]     -0.1425  0.0705 -2.0218  0.0437
education                                    0.0759  0.0101  7.5449  0.0000
experience                                   0.0119  0.0017  7.0895  0.0000

7.3.1 ANOVA Tables

CPS1985 = pd.read_csv("../data/CPS1985.csv")

# run regression:
reg = smf.ols(
    formula="np.log(wage) ~ education + experience + gender + occupation",
    data=CPS1985,
)
results = reg.fit()

# print regression table:
table_reg = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table_reg: \n{table_reg}\n")
table_reg: 
                              b      se       t    pval
Intercept                0.9050  0.1717  5.2718  0.0000
gender[T.male]           0.2238  0.0423  5.2979  0.0000
occupation[T.office]    -0.2073  0.0776 -2.6699  0.0078
occupation[T.sales]     -0.3601  0.0936 -3.8455  0.0001
occupation[T.services]  -0.3626  0.0818 -4.4305  0.0000
occupation[T.technical] -0.0101  0.0740 -0.1363  0.8916
occupation[T.worker]    -0.1525  0.0763 -1.9981  0.0462
education                0.0759  0.0101  7.5449  0.0000
experience               0.0119  0.0017  7.0895  0.0000

# ANOVA table:
table_anova = sm.stats.anova_lm(results, typ=2)
print(f"table_anova: \n{table_anova}\n")
table_anova: 
                sum_sq     df          F        PR(>F)
gender        5.414018    1.0  28.067296  1.727015e-07
occupation    7.152529    5.0   7.416013  9.805485e-07
education    10.980589    1.0  56.925450  2.010374e-13
experience    9.695055    1.0  50.261001  4.365391e-12
Residual    101.269451  525.0        NaN           NaN

7.4 Breaking a Numeric Variable Into Categories

Example 7.8: Effects of Law School Rankings on Starting Salaries

lawsch85 = wool.data("lawsch85")

# define cut points for the rank:
cutpts = [0, 10, 25, 40, 60, 100, 175]

# create categorical variable containing ranges for the rank:
lawsch85["rc"] = pd.cut(
    lawsch85["rank"],
    bins=cutpts,
    labels=["(0,10]", "(10,25]", "(25,40]", "(40,60]", "(60,100]", "(100,175]"],
)

# display frequencies:
freq = pd.crosstab(lawsch85["rc"], columns="count")
print(f"freq: \n{freq}\n")
freq: 
col_0      count
rc              
(0,10]        10
(10,25]       16
(25,40]       13
(40,60]       18
(60,100]      37
(100,175]     62

# run regression:
reg = smf.ols(
    formula='np.log(salary) ~ C(rc, Treatment("(100,175]")) +'
    "LSAT + GPA + np.log(libvol) + np.log(cost)",
    data=lawsch85,
)
results = reg.fit()

# print regression table:
table_reg = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table_reg: \n{table_reg}\n")
table_reg: 
                                                b      se        t    pval
Intercept                                  9.1653  0.4114  22.2770  0.0000
C(rc, Treatment("(100,175]"))[T.(0,10]]    0.6996  0.0535  13.0780  0.0000
C(rc, Treatment("(100,175]"))[T.(10,25]]   0.5935  0.0394  15.0493  0.0000
C(rc, Treatment("(100,175]"))[T.(25,40]]   0.3751  0.0341  11.0054  0.0000
C(rc, Treatment("(100,175]"))[T.(40,60]]   0.2628  0.0280   9.3991  0.0000
C(rc, Treatment("(100,175]"))[T.(60,100]]  0.1316  0.0210   6.2540  0.0000
LSAT                                       0.0057  0.0031   1.8579  0.0655
GPA                                        0.0137  0.0742   0.1850  0.8535
np.log(libvol)                             0.0364  0.0260   1.3976  0.1647
np.log(cost)                               0.0008  0.0251   0.0335  0.9734

# ANOVA table:
table_anova = sm.stats.anova_lm(results, typ=2)
print(f"table_anova: \n{table_anova}\n")
table_anova: 
                                 sum_sq     df          F        PR(>F)
C(rc, Treatment("(100,175]"))  1.868867    5.0  50.962988  1.174406e-28
LSAT                           0.025317    1.0   3.451900  6.551320e-02
GPA                            0.000251    1.0   0.034225  8.535262e-01
np.log(libvol)                 0.014327    1.0   1.953419  1.646748e-01
np.log(cost)                   0.000008    1.0   0.001120  9.733564e-01
Residual                       0.924111  126.0        NaN           NaN

7.5 Interactions and Differences in Regression Functions Across Groups

gpa3 = wool.data("gpa3")

# model with full interactions with female dummy (only for spring data):
reg = smf.ols(
    formula="cumgpa ~ female * (sat + hsperc + tothrs)",
    data=gpa3,
    subset=(gpa3["spring"] == 1),
)
results = reg.fit()

# print regression table:
table = pd.DataFrame(
    {
        "b": round(results.params, 4),
        "se": round(results.bse, 4),
        "t": round(results.tvalues, 4),
        "pval": round(results.pvalues, 4),
    },
)
print(f"table: \n{table}\n")
table: 
                    b      se       t    pval
Intercept      1.4808  0.2073  7.1422  0.0000
female        -0.3535  0.4105 -0.8610  0.3898
sat            0.0011  0.0002  5.8073  0.0000
hsperc        -0.0085  0.0014 -6.1674  0.0000
tothrs         0.0023  0.0009  2.7182  0.0069
female:sat     0.0008  0.0004  1.9488  0.0521
female:hsperc -0.0005  0.0032 -0.1739  0.8621
female:tothrs -0.0001  0.0016 -0.0712  0.9433

# F-Test for H0 (the interaction coefficients of 'female' are zero):
hypotheses = ["female = 0", "female:sat = 0", "female:hsperc = 0", "female:tothrs = 0"]
ftest = results.f_test(hypotheses)
fstat = ftest.statistic
fpval = ftest.pvalue

print(f"fstat: {fstat}\n")
print(f"fpval: {fpval}\n")
fstat: 8.179111637044619

fpval: 2.544637191829608e-06

gpa3 = wool.data("gpa3")

# estimate model for males (& spring data):
reg_m = smf.ols(
    formula="cumgpa ~ sat + hsperc + tothrs",
    data=gpa3,
    subset=(gpa3["spring"] == 1) & (gpa3["female"] == 0),
)
results_m = reg_m.fit()

# print regression table:
table_m = pd.DataFrame(
    {
        "b": round(results_m.params, 4),
        "se": round(results_m.bse, 4),
        "t": round(results_m.tvalues, 4),
        "pval": round(results_m.pvalues, 4),
    },
)
print(f"table_m: \n{table_m}\n")
table_m: 
                b      se       t    pval
Intercept  1.4808  0.2060  7.1894  0.0000
sat        0.0011  0.0002  5.8458  0.0000
hsperc    -0.0085  0.0014 -6.2082  0.0000
tothrs     0.0023  0.0009  2.7362  0.0066

# estimate model for females (& spring data):
reg_f = smf.ols(
    formula="cumgpa ~ sat + hsperc + tothrs",
    data=gpa3,
    subset=(gpa3["spring"] == 1) & (gpa3["female"] == 1),
)
results_f = reg_f.fit()

# print regression table:
table_f = pd.DataFrame(
    {
        "b": round(results_f.params, 4),
        "se": round(results_f.bse, 4),
        "t": round(results_f.tvalues, 4),
        "pval": round(results_f.pvalues, 4),
    },
)
print(f"table_f: \n{table_f}\n")
table_f: 
                b      se       t    pval
Intercept  1.1273  0.3616  3.1176  0.0025
sat        0.0018  0.0003  5.1950  0.0000
hsperc    -0.0090  0.0029 -3.0956  0.0027
tothrs     0.0022  0.0014  1.5817  0.1174