Exercise

library(readxl)
census_2006 <- read_xlsx("data/ABS_ed_age_sex_2006.xlsx", skip=9, n_max=10)
census_2011 <- read_xlsx("data/ABS_ed_age_sex_2011.xlsx", skip=9, n_max=10)
census_2016 <- read_xlsx("data/ABS_ed_age_sex_2016.xlsx", skip=9, n_max=10)

The first 9 rows are general information about the data, not actual data. There are 10 rows of data, containing categories of education. The 10th row contains the sum of the other rows. We could drop this and re-calculate totals on the fly.

  1. Tidy the data, to produce a data set with these columns: census, educ, age, sex, count

The format is really awkward! The main problem in that pairs of columns are age groups, and every second column is a different gender. A first step is getting these column labels to be descriptive, and then gather and separate. The first column contains nothing useful and should be dropped. Also drop the last two columns because these are totals for the education level.

library(tidyverse)
census_2006_long <- census_2006 %>%
  select(-`Sex Male/Female (SEXP)`, -Male__21, -Female__21) %>%
  rename(educ=X__1, 
         m_0004=Male, f_0004=Female, 
         m_0509=Male__1, f_0509=Female__1,
         m_1014=Male__2, f_1014=Female__2,
         m_1519=Male__3, f_1519=Female__3,
         m_2024=Male__4, f_2024=Female__4,
         m_2529=Male__5, f_2529=Female__5,
         m_3034=Male__6, f_3034=Female__6,
         m_3539=Male__7, f_3539=Female__7,
         m_4044=Male__8, f_4044=Female__8,
         m_4549=Male__9, f_4549=Female__9,
         m_5054=Male__10, f_5054=Female__10,
         m_5559=Male__11, f_5559=Female__11,
         m_6064=Male__12, f_6064=Female__12,
         m_6569=Male__13, f_6569=Female__13,
         m_7074=Male__14, f_7074=Female__14,
         m_7579=Male__15, f_7579=Female__15,
         m_8084=Male__16, f_8084=Female__16,
         m_8589=Male__17, f_8589=Female__17,
         m_9094=Male__18, f_9094=Female__18,
         m_9599=Male__19, f_9599=Female__19,
         m_100=Male__20, f_100=Female__20) %>%
  filter(educ != "Total") %>%
  gather(label, count, -educ) %>%
  separate(label, c("gender", "age"))
  1. Combine the data from the three different census periods

The same code from question 1 is repeated. We add a new column called year (2006, 2011, 2016), and bind the rows together.

library(tidyverse)
census_2011_long <- census_2011 %>%
  select(-`Sex (SEXP)`, -Male__21, -Female__21) %>%
  rename(educ=X__1, 
         m_0004=Male, f_0004=Female, 
         m_0509=Male__1, f_0509=Female__1,
         m_1014=Male__2, f_1014=Female__2,
         m_1519=Male__3, f_1519=Female__3,
         m_2024=Male__4, f_2024=Female__4,
         m_2529=Male__5, f_2529=Female__5,
         m_3034=Male__6, f_3034=Female__6,
         m_3539=Male__7, f_3539=Female__7,
         m_4044=Male__8, f_4044=Female__8,
         m_4549=Male__9, f_4549=Female__9,
         m_5054=Male__10, f_5054=Female__10,
         m_5559=Male__11, f_5559=Female__11,
         m_6064=Male__12, f_6064=Female__12,
         m_6569=Male__13, f_6569=Female__13,
         m_7074=Male__14, f_7074=Female__14,
         m_7579=Male__15, f_7579=Female__15,
         m_8084=Male__16, f_8084=Female__16,
         m_8589=Male__17, f_8589=Female__17,
         m_9094=Male__18, f_9094=Female__18,
         m_9599=Male__19, f_9599=Female__19,
         m_100=Male__20, f_100=Female__20) %>%
  filter(educ != "Total") %>%
  gather(label, count, -educ) %>%
  separate(label, c("gender", "age"))
census_2016_long <- census_2016 %>%
  select(-`SEXP Sex`, -Male__21, -Female__21) %>%
  rename(educ=X__1, 
         m_0004=Male, f_0004=Female, 
         m_0509=Male__1, f_0509=Female__1,
         m_1014=Male__2, f_1014=Female__2,
         m_1519=Male__3, f_1519=Female__3,
         m_2024=Male__4, f_2024=Female__4,
         m_2529=Male__5, f_2529=Female__5,
         m_3034=Male__6, f_3034=Female__6,
         m_3539=Male__7, f_3539=Female__7,
         m_4044=Male__8, f_4044=Female__8,
         m_4549=Male__9, f_4549=Female__9,
         m_5054=Male__10, f_5054=Female__10,
         m_5559=Male__11, f_5559=Female__11,
         m_6064=Male__12, f_6064=Female__12,
         m_6569=Male__13, f_6569=Female__13,
         m_7074=Male__14, f_7074=Female__14,
         m_7579=Male__15, f_7579=Female__15,
         m_8084=Male__16, f_8084=Female__16,
         m_8589=Male__17, f_8589=Female__17,
         m_9094=Male__18, f_9094=Female__18,
         m_9599=Male__19, f_9599=Female__19,
         m_100=Male__20, f_100=Female__20) %>%
  filter(educ != "Total") %>%
  gather(label, count, -educ) %>%
  separate(label, c("gender", "age"))

Now create the new variable, and bind the rows.

census_2006_long <- census_2006_long %>% 
  mutate(year = 2006)
census_2011_long <- census_2011_long %>% 
  mutate(year = 2011)
census_2016_long <- census_2016_long %>% 
  mutate(year = 2016)
census <- bind_rows(census_2006_long, census_2011_long, census_2016_long)
  1. Aggregate the counts across sex, and age, and compute the totals across educ, for each census. Plot these totals against the census year, using a line connecting values for each educ group. Write a couple of paragraphs about what you learn.
census_educ <- census %>% 
  group_by(year, educ) %>%
  summarise(count = sum(count, na.rm=TRUE))
ggplot(census_educ, aes(x=year, y=count, colour=educ)) + 
  geom_point() + geom_line()

The main thing to notice is that there are a lot of education categories, and they are not all the same from one year to another. Filter out the non-categories, e.g. “Not applicable”, to focus on the different education levels in the plot. The categories of high school education have changed, and the categories of certificates have changed. We would need to do some re-working of these category names to compare these across years. This will be covered later in the course.

census_educ_short <- census_educ %>%
  filter(educ %in% c("Advanced Diploma and Diploma Level",
                     "Bachelor Degree Level", 
                     "Graduate Diploma and Graduate Certificate Level",
                     "Postgraduate Degree Level"))
ggplot(census_educ_short, aes(x=year, y=count, colour=educ)) + 
  geom_point() + geom_line()

The number of people with all levels of university education have increased over this time period.

  1. Subset the data to age group 25-29. Make a plot of educ counts by census year, faceted by sex. Write a couple of paragraphs about what you learn.
census %>% filter(age == "2529") %>%
  filter(educ %in% c("Advanced Diploma and Diploma Level",
                     "Bachelor Degree Level", 
                     "Graduate Diploma and Graduate Certificate Level",
                     "Postgraduate Degree Level")) %>%
  ggplot(aes(x=year, y=count, colour=educ)) + 
  geom_point() + geom_line() +
  facet_wrap(~gender)

Over both women and men, numbers are increasing. In each education level number of women is high than men.

  1. Subset the data to age groups, 15-19, 20-24, 25-29, 30-34. Make a plot of educ counts by census year, faceted by age group and sex. Write a couple of paragraphs about what you learn.
census %>% filter(age %in% c("1519", "2024", "2529", "3034")) %>%
  filter(educ %in% c("Advanced Diploma and Diploma Level",
                     "Bachelor Degree Level", 
                     "Graduate Diploma and Graduate Certificate Level",
                     "Postgraduate Degree Level")) %>%
  ggplot(aes(x=year, y=count, colour=educ)) + 
  geom_point() + geom_line() +
  facet_grid(age~gender)