worksheet-6.R 1.89 KB
Newer Older
Zheng Liu's avatar
Zheng Liu committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
# Linear models

library(readr)
library(dplyr)
library(ggplot2)
person <- read_csv(
  file = 'data/census_pums/sample.csv',
  col_types = cols_only(
    AGEP = 'i',  # Age
    WAGP = 'd',  # Wages or salary income past 12 months
    SCHL = 'i',  # Educational attainment
    SEX = 'f',   # Sex
    OCCP = 'f',  # Occupation recode based on 2010 OCC codes
    WKHP = 'i')) # Usual hours worked per week past 12 months


person <- within(person, {
  SCHL <- factor(SCHL)
  levels(SCHL) <- list(
    'Incomplete' = c(1:15),
    'High School' = 16,
    'College Credit' = 17:20,
    'Bachelor\'s' = 21,
    'Master\'s' = 22:23,
    'Doctorate' = 24)}) %>%
  filter(
    WAGP > 0,
    WAGP < max(WAGP, na.rm = TRUE))

# Formula Notation

fit <- lm(
  formula = WAGP ~ SCHL,
  data = person)

ggplot(person, aes(x=SCHL,y=WAGP)) + geom_boxplot()
fit

fit <- lm(
  formula = log(WAGP) ~ SCHL,
  data = person)
summary(fit)

fit <- lm(
  formula = log(WAGP) ~ AGEP,
  data = person)
summary(fit)
ggplot(person, aes(x=AGEP,y=log(WAGP))) + geom_point()
fit <- lm(
  ...,
  person)

# Metadata matters

fit <- lm(
  ...,
  person)

# GLM families

fit <- glm(log(WAGP)~SCHL,
  family = gaussian,
  data = person)
summary(fit)
# Logistic Regression

fit <- glm(SEX ~ WAGP,
  family = binomial,
  data = person)
summary(fit)

ggplot(person, aes(x=WAGP,y=SEX)) + geom_point()

anova(fit,update(fit,))

levels(person$SEX)
anova(fit, update(fit, SEX~1), test = 'Chisq')

# Random Intercept

library(lme4)
fit <- lmer(
  log(WAGP) ~ (1|OCCP) + SCHL,
  data = person)

summary(fit)
# Random Slope

fit <- lmer(
  log(WAGP) ~ (WKHP | SCHL),
  data = person)
summary(fit)


fit <- lmer(
  log(WAGP) ~ (WKHP | SCHL),
  data = person,
  control = lmerControl(optimizer = 'bobyqa'))

ggplot(person,
  aes(x = WKHP, y = log(WAGP), color = SCHL)) +
  geom_point() +
  geom_line(aes(y = predict(fit))) +
  labs(title = 'Random intercept and slope with lmer')