# JVM options are only read when the JVM starts, so set them before loading
# any package that might start one.
# NOTE(review): presumably the Redshift connection below is JDBC/rJava based,
# which is why these options matter - confirm.
options(java.parameters = c("-XX:+UseConcMarkSweepGC", "-Xmx16000m"))

library(ColeridgeInitiative)
library(tidyverse)
library(dbplyr)
library(zoo) # requires the zoo package to be installed

# Imputation Illustration

# Release unused memory before pulling data.
gc()

# Connection handle used to query the training database.
con <- adrf_redshift(usertype = "training")
-- Every row of the matched analysis table, joined to that person's quarterly
-- UI wage records. The quarter filter sits in the ON clause rather than in a
-- WHERE clause, so people with no qualifying wage row are still returned by
-- the LEFT JOIN (with NULL wage columns).
-- A wage row qualifies when its quarter key is between 27 and 39 inclusive,
-- or when its wage amount is NULL (regardless of quarter); the parentheses
-- spell out the AND-before-OR precedence the original relied on.
select
    ncm.*,
    fpuw.year_quarter_key,
    fpuw.ui_quarterly_wages
from tr_state_impact_ada_training.nb_analysis_matched ncm
left join tr_state_impact_ada_training.fact_person_ui_wage fpuw
    on ncm.person_key = fpuw.person_key
    and (
        (fpuw.year_quarter_key >= 27 and fpuw.year_quarter_key <= 39)
        or fpuw.ui_quarterly_wages is null
    )
order by ncm.person_key, fpuw.year_quarter_key

# Releveling a factor reference category
# Category counts with the levels in their current order.
table(data_did$eth_recode)

# Move "Wht" to the front of the level order; the first level is what R's
# default contrasts use as the reference category in a model.
data_did <- data_did |>
  mutate(eth_recode = fct_relevel(eth_recode, "Wht"))

# Same counts again, to confirm "Wht" is now listed first.
table(data_did$eth_recode)

# Imputation examples
# Group-wise modal imputation for categorical variables
# This example requires the statip package to be installed.
# install.packages("statip")
# Group-wise modal imputation example: within each eth_recode group, replace
# missing gender values with that group's most frequent observed gender.
data_did_modal <- data_did |>
  group_by(eth_recode) |>
  mutate(
    # mfv() returns every tied mode. Keep only the first so the replacement is
    # a single value; otherwise ifelse() would recycle the tied modes across
    # the rows of the group.
    gender = ifelse(
      is.na(gender),
      as.character(statip::mfv(gender, na_rm = TRUE))[1],
      as.character(gender)
    )
  ) |>
  ungroup()

# Group-wise median imputation for continuous variables
# Group-wise median imputation example: within each eth_recode group, fill
# missing ui_quarterly_wages with the median of that group's observed wages.
data_did_median <- data_did |>
  group_by(eth_recode) |>
  mutate(
    # Use the bare column name here. Writing data_did$ui_quarterly_wages would
    # bypass the grouping and give every group the overall median instead.
    # The output column name is kept exactly as originally spelled so existing
    # references to it keep working.
    quartery_wages_imp = if_else(
      is.na(ui_quarterly_wages),
      median(ui_quarterly_wages, na.rm = TRUE),
      ui_quarterly_wages
    )
  ) |>
  ungroup()

# Nearest neighbor imputation
# Specify the variables you want imputed in the `variable` list, and the variables used to measure how close two observations are in the `dist_var` list.
# k-nearest-neighbour imputation example
# install.packages("VIM")

# Columns whose missing values should be filled in.
knn_targets <- c(
  "gender", "age_cat", "edu_cat",
  "wages_q37", "wages_q41", "ui_quarterly_wages"
)

# Columns used to measure how close two observations are.
knn_distance_vars <- c("gender", "age_cat", "edu_cat")

# Impute from the 3 nearest neighbours; imp_var = FALSE keeps the result free
# of the extra imputation-indicator columns.
df_knn <- VIM::kNN(
  data_did,
  variable = knn_targets,
  dist_var = knn_distance_vars,
  k = 3,
  imp_var = FALSE
)
df_knn

# Compare the wage distribution before and after imputation.
summary(data_did$ui_quarterly_wages)
summary(df_knn$ui_quarterly_wages)