Notes:
- Takes the Civil Rights Data Collection (CRDC) data published by the U.S. Department of Education
- Rolls the data up to the school-district (LEA) level: negative values are treated as missing, numeric columns are summed, boolean columns are OR-ed, and any other column keeps its first value
- Filters to a specific state (PA)
- Source data is 733 MB and the result is 3.7 MB
- Data definitions are here: crdcGlossaryAndGuide.pdf (nysed.gov)
library(dplyr)
# Input folder that is scanned for CSV files; passed to load_csv_files() below.
crdc <- "C:\\data\\CRDC\\CRDC\\School"
# Load every CSV file in a folder, keeping only the rows of one state.
#
# Args:
#   folder_path: directory scanned (non-recursively) for files whose names
#     end in ".csv".
#   state: value(s) of the LEA_STATE column to keep. Defaults to "PA", the
#     value this function used to hard-code.
#
# Returns: a named list of data frames, one per CSV file, keyed by the file's
#   base name and holding only the rows whose LEA_STATE is in `state`.
#
# Raises: an error when `folder_path` is not a directory or when a file has
#   no LEA_STATE column.
load_csv_files <- function(folder_path, state = "PA") {
  if (!dir.exists(folder_path)) {
    # list.files() silently returns nothing for a missing folder, which would
    # make the whole run a no-op; fail loudly instead.
    stop("Input folder does not exist: ", folder_path, call. = FALSE)
  }

  csv_files <- list.files(folder_path, pattern = "\\.csv$", full.names = TRUE)
  loaded_dataframes <- list()

  for (csv_file in csv_files) {
    # NOTE(review): read.csv() guesses column types, so identifier columns
    # with leading zeros would be read as numbers -- confirm this is
    # acceptable before using states other than the default.
    dataframe <- read.csv(csv_file)

    if (!"LEA_STATE" %in% names(dataframe)) {
      stop(
        "File has no LEA_STATE column: ", basename(csv_file),
        call. = FALSE
      )
    }

    # %in% never yields NA, so rows with a missing state are dropped rather
    # than turned into all-NA rows.
    keep <- dataframe[["LEA_STATE"]] %in% state
    state_data <- dataframe[keep, , drop = FALSE]
    rownames(state_data) <- NULL

    loaded_dataframes[[basename(csv_file)]] <- state_data
  }

  loaded_dataframes
}
# Read every CSV in the input folder, keeping only the PA rows of each file.
all_data <- load_csv_files(crdc)
# Roll a school-level table up to one row per LEA.
#
# Args:
#   data_frame: data frame with the columns LEA_STATE, LEA_STATE_NAME, LEAID
#     and LEA_NAME. The columns SCHID, SCH_NAME, COMBOKEY and JJ are dropped
#     when present.
#
# Returns: an ungrouped tibble with one row per distinct combination of
#   LEA_STATE, LEA_STATE_NAME, LEAID and LEA_NAME. Numeric columns are summed
#   and logical columns are OR-ed (missing values ignored in both cases);
#   every other column keeps the first value of its group.
#
# NOTE(review): only columns that read.csv() parsed as logical are OR-ed. If
# the flag columns arrive as text (e.g. "Yes"/"No") they fall into the
# first-value branch -- confirm against the input files.
group_and_summarize <- function(data_frame) {
  school_level_cols <- c("SCHID", "SCH_NAME", "COMBOKEY", "JJ")

  # Values below zero are treated as missing. The comparison is applied to
  # every column, as before; for character columns R compares as strings, so
  # text such as "-9" is normally blanked as well (collation dependent).
  # which() skips cells that are already NA.
  blank_negative <- function(x) {
    replace(x, which(x < 0), NA)
  }

  roll_up <- function(x) {
    if (is.numeric(x)) {
      sum(x, na.rm = TRUE)
    } else if (is.logical(x)) {
      # na.rm = TRUE so one missing flag does not turn the whole group into
      # NA, matching how missing numbers are ignored by sum() above.
      any(x, na.rm = TRUE)
    } else {
      first(x)
    }
  }

  data_frame %>%
    # any_of() tolerates files that lack some of the school-level columns.
    select(-any_of(school_level_cols)) %>%
    mutate(across(everything(), blank_negative)) %>%
    group_by(LEA_STATE, LEA_STATE_NAME, LEAID, LEA_NAME) %>%
    # .groups = "drop" returns a plain tibble and silences the regrouping
    # message that summarise() otherwise prints.
    summarise(across(everything(), roll_up), .groups = "drop")
}
# Apply group_and_summarize() to every named table in a list.
#
# Args:
#   all_data: named list of data frames.
#
# Returns: a list holding the summarized version of each table under the
#   same name it had in `all_data`.
summarize_all_dfs <- function(all_data) {
  # Work from the names (not positions) so lookups match `all_data[[name]]`.
  table_names <- unique(names(all_data))
  rolled_up <- lapply(table_names, function(table_name) {
    group_and_summarize(all_data[[table_name]])
  })
  names(rolled_up) <- table_names
  rolled_up
}
# Roll every loaded table up to one row per LEA (see group_and_summarize()).
all_summarized <- summarize_all_dfs(all_data)
# Write each data frame in a named list to its own CSV file.
#
# Args:
#   dataframes_list: named list of data frames; each name is used verbatim as
#     the output file name.
#   output_folder: directory to write into. It is created, together with any
#     missing parent directories, when it does not exist yet.
#
# Returns: NULL, invisibly. Called for its side effect of writing files.
#
# Raises: an error when `output_folder` cannot be created.
write_dataframes_to_csv <- function(dataframes_list, output_folder) {
  # dir.exists() rather than file.exists(): a regular file with the same name
  # must not be mistaken for the target folder.
  if (!dir.exists(output_folder)) {
    created <- dir.create(output_folder, recursive = TRUE)
    if (!created) {
      stop("Could not create output folder: ", output_folder, call. = FALSE)
    }
  }

  for (csv_name in names(dataframes_list)) {
    output_path <- file.path(output_folder, csv_name)
    write.csv(dataframes_list[[csv_name]], file = output_path, row.names = FALSE)
  }

  invisible(NULL)
}
# Write one CSV per summarized table into the "summarized" subfolder.
write_dataframes_to_csv (all_summarized, "C:\\data\\CRDC\\CRDC\\School\\summarized")