# import cleaned data
var_types <- "ccccDiicccidddddididididididididididididididididididddddc"
df_broiler <- read_csv(data_path, col_types = var_types)
# how many batches?
nrow(df_broiler) %>% format(big.mark = ",")
## [1] "571,276"
# find out how many animals?
sum(df_broiler$n_animals, na.rm = TRUE) %>% format(big.mark = ",")
## [1] "3,994,373,939"
# get filenames and load the datafiles into a list
file_names <- list.files(data_path, pattern = "^Broiler.*\\.csv$")
df_broiler <- map(file.path(data_path, file_names), read_csv)
names(df_broiler) <- file_names
# print number of columns for each data file
map_dbl(df_broiler, ncol)
## Broiler 2010.csv Broiler 2011.csv Broiler 2012.csv
## 57 57 57
## Broiler 2013-14.csv
## 56
# get all unique variable names across all dataframes
var_names <- df_broiler %>% map(names) %>% unlist %>% unique
# for each dataframe, check if each variable actually exists, and if not, print
# it out.
map(df_broiler,
function (x){
ix <-
var_names %>%
map_lgl(function (y) y %in% names(x) == FALSE) %>%
which
var_names[ix]
})
## $`Broiler 2010.csv`
## character(0)
##
## $`Broiler 2011.csv`
## character(0)
##
## $`Broiler 2012.csv`
## character(0)
##
## $`Broiler 2013-14.csv`
## [1] "AnimalHealthReport"
var_class <-
map_df(df_broiler,
function (x) {
map_chr(x, class)
})
ix <- apply(var_class, 1, function (x) length(unique(x)) > 1)
bind_cols(c(names(df_broiler[[1]])[which(ix)], var_class[which(ix), ]))
## # A tibble: 1 x 5
## V1 `Broiler 2010.c… `Broiler 2011.c… `Broiler 2012.c… `Broiler 2013-1…
## <chr> <chr> <chr> <chr> <chr>
## 1 Corr… character numeric numeric numeric
str(df_broiler$`Broiler 2010.csv`$Correction)
## chr [1:51028] "NULL" "NULL" "NULL" "NULL" "NULL" "NULL" "NULL" "NULL" ...
df_broiler$`Broiler 2010.csv`$Correction <-
as.integer(df_broiler$`Broiler 2010.csv`$Correction)
# combine list with dataframes into a single dataframe
df_broiler <- bind_rows(df_broiler)
ggplot(df_broiler, mapping = aes(x = StartDate)) +
geom_histogram(bins = 100) +
scale_x_date(name = "Start Date") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
summary(df_broiler$StartDate)
## Min. 1st Qu. Median Mean 3rd Qu.
## "1980-01-01" "2011-08-18" "2012-09-05" "2012-08-12" "2013-08-23"
## Max. NA's
## "2043-08-22" "1260"
df_broiler %>%
filter(StartDate < "2010-01-01" |
StartDate > "2014-06-08") %>%
select(contains("Date")) %>%
head()
## # A tibble: 6 x 2
## StartDate CompletionDate
## <date> <date>
## 1 2008-10-28 2011-03-01
## 2 2008-09-01 2011-01-10
## 3 1980-01-01 2014-01-08
## 4 1980-01-01 2014-01-08
## 5 1980-01-07 2014-01-08
## 6 2025-04-25 2014-04-25
# get an index vector for the rows with invalid dates
ix <- ((df_broiler$StartDate < "2010-01-01") |
(df_broiler$StartDate > "2014-06-08")) &
!is.na(df_broiler$StartDate)
# replace start date with completion date
df_broiler$StartDate[ix] <- df_broiler$CompletionDate[ix]
# plot new distribution
ggplot(df_broiler) +
geom_bar(mapping = aes(x = format(StartDate, "%Y-%m"), ..count..)) +
scale_x_discrete(name = "Start Date") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
duplicate_cases <- duplicated(df_broiler)
sum(duplicate_cases)
## [1] 1258
df_broiler <- df_broiler[!duplicate_cases, ]
ggplot(df_broiler) +
geom_bar(mapping = aes(x = format(StartDate, "%Y-%m"), ..count..)) +
scale_x_discrete(name = "Start Date") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
df_subset <-
df_broiler %>%
select(
# identifiers
BatchHouseID,
BatchID,
# risk factors
ProducerPostcode,
StartDate,
NAnimals,
PoultryProductionSystem,
PoultryBroilerStockingDensity,
PoultryBroilerBreed,
AgeOfBirds,
CumulativeHouseMortalityPercentage,
FPD0:FPD2,
# outcomes
AscitesOrOedema:TotalRejectionsTriggerIndicator
)
duplicate_cases <- duplicated(df_subset)
sum(duplicate_cases)
## [1] 35730
duplicate_ids <- df_broiler$BatchHouseID[duplicate_cases] %>% unique
df_random_check <- filter(df_broiler, BatchHouseID == sample(duplicate_ids, 1))
ix <- apply(df_random_check, 2, function (x) length(unique(x)) > 1)
df_random_check[, ix]
## # A tibble: 2 x 2
## MHSOfficePhone EstablishmentPhone
## <chr> <chr>
## 1 15485638 27875412
## 2 015485638 027875412
## # A tibble: 2 x 2
## HouseNo MHSOfficePhone
## <chr> <chr>
## 1 41732 92986522
## 2 3/4 092986522
Note that these are fake phonenumbers for the purposes of illustration, hence they don’t fit standard phone number format.
df_broiler <- df_broiler[!duplicate_cases, ]
ggplot(df_broiler) +
geom_bar(mapping = aes(x = format(StartDate, "%Y-%m"), ..count..)) +
scale_x_discrete(name = "Start Date") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
df_subset <- filter(df_broiler,
StartDate > "2013-02-28"
& StartDate < "2013-04-01")
duplicate_cases <-
select(df_subset, BatchHouseID, BatchID, NAnimals) %>%
duplicated()
duplicate_ids <- df_subset$BatchHouseID[duplicate_cases] %>% unique
df_random_check <- filter(df_subset, BatchHouseID == sample(duplicate_ids, 1))
ix <- apply(df_random_check, 2, function (x) length(unique(x)) > 1)
df_random_check[, ix]
## # A tibble: 2 x 6
## ProducerAddress2 HouseNo FPD0 FPD1 FPD2 FPDSwedishScore
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 NULL 01-Feb NULL NULL NULL NULL
## 2 <NA> 1-2 <NA> <NA> <NA> <NA>
## # A tibble: 2 x 4
## AscitesOrOedemaPe… DermatitisOrCelluli… DOAPercentage JointLesionsOrArth…
## <dbl> <dbl> <dbl> <dbl>
## 1 0.201 0.0564 0.152 0.00705
## 2 0 0 0 0
anonimise(df_broiler$CPH[200])
## [1] "74/938/2350"
# check which ID's comply with format, allowing for some variation
check_CPH <- str_detect(df_broiler$CPH, "\\d{2}\\D\\d{3}\\D\\d{4}")
# extract those that dont' comply
weird_CPH <- unique(df_broiler$CPH[!check_CPH])
# print off
anonimise(weird_CPH)
## [1] "03/857/753" "48/3025" "48/850/303"
## [4] "822/735/12" "97/542/232" "09/658/"
## [7] "502/638/138" "671/406/29" "65/122/9???"
## [10] "17/86/3695" "uhpghkfw@thlxeis.oby" "l/k"
## [13] "toz itehb" "szwb" "ikwnxst"
## [16] "05/021/106" "87/895/400" "71/65/1565"
## [19] "80/862/70" "26/011/137" "238176741"
## [22] "40/39/878"
Anonimise is a custom function that I wrote to randomly substitute letters for other letters and digits for other digits, for the purposes of this presentation.
BS6 6AA
BS34 5LE
EC1V 9BD
pc_correct <-
str_detect(df_broiler$ProducerPostcode, "^[:alpha:].*\\d[:alpha:]{2}$")
anonimise(unique(df_broiler$ProducerPostcode[!pc_correct]))
## [1] NA "sv4t y" "me62 hi" "ui57 kt" "so" "ox56 uz"
## [7] "lq82 zx" "il87 bn" "pp3" "cm98 iyx" "pq44 cb" "yglg"
## [13] "al9 mgk" "qydt rn" "wq69 zf" "h670 yv" "qc" "mp96 se"
## [19] "hd90 xe" "ps41 rk" "qx62 rz" "he67 kh" "nr66 hm" "sm05 pv"
## [25] "lz05 le" "ew45 yi" "bd52 cuk" "pc53 7o4" "xp59 tf" "nbtu xp"
## [31] "qgdz ax" "fbek is" "zg58 bb" "zt92 ou" "xy69 wt" "cw97 va"
## [37] "ca50 lm" "gg02 zz" "wb12 is" "er86 wx" "6636 5" "ec08 dd"
## [43] "td02 jc" "va73 to" "kc39 lr" "ee41 40y" "uc34 lg"
Anonimise is a custom function that I wrote to randomly substitute letters for other letters and digits for other digits, for the purposes of this presentation.
hist(df_broiler$`Poultry Age`, breaks = 1000)
hist(df_broiler$`Poultry Age`[df_broiler$`Poultry Age` < 200], breaks = 1000)
summary(df_broiler$`Poultry Age`)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 33 37 105 40 6472122 24778
df_check <-
df_broiler %>%
filter(`Poultry Age` == `Number Animals`) %>%
select(`Poultry Age`, `Number Animals`)
df_check
## # A tibble: 6 x 2
## `Poultry Age` `Number Animals`
## <dbl> <dbl>
## 1 70 70
## 2 7128 7128
## 3 7128 7128
## 4 7128 7128
## 5 84 84
## 6 100 100
hist(df_broiler$`Poultry Age`, breaks = 1000)
# colour palette
c_palette <- c("#999999",
"#E69F00",
"#56B4E9",
"#009E73",
"#F0E442",
"#0072B2",
"#D55E00",
"#CC79A7")
# plot
ggplot(df_broiler) +
geom_bar(mapping = aes(x = `Poultry Age`,
y = ..count..,
fill = `Poultry Production System`),
alpha = 0.5) +
scale_fill_manual(values = c_palette) +
theme_light()
library(assertr)
Verify your data in the analysis pipeline.
Works very well with %>%
https://cran.r-project.org/web/packages/assertr/vignettes/assertr.html
Example: I want to ensure that only the correct breeds are recorded in the dataset.
breed <- c("Cobb",
"Hubbard",
"Hybro",
"Ross",
"Other")
df_broiler %>%
assert(in_set(breed), `Broiler Breed`) %>%
group_by(`Broiler Breed`) %>%
count()
## Column 'Broiler Breed' violates assertion 'in_set(breed)' 7944 times
## verb redux_fn predicate column index value
## 1 assert NA in_set(breed) Broiler Breed 14 Mixed or Not Provided
## 2 assert NA in_set(breed) Broiler Breed 15 Mixed or Not Provided
## 3 assert NA in_set(breed) Broiler Breed 17 Mixed or Not Provided
## 4 assert NA in_set(breed) Broiler Breed 18 Mixed or Not Provided
## 5 assert NA in_set(breed) Broiler Breed 19 Mixed or Not Provided
## [omitted 7939 rows]
## Warning: assertr encountered errors
Can use your own predicate functions.
E.g. we want the analysis to stop if postcode is false, because location is important for our predictions.
pred_function <- function (x) str_detect(x, ".*\\d[:alpha:]{2}$")
df_broiler %>%
assert(pred_function, `Producer Postcode`) %>%
group_by(`Producer Postcode`) %>%
count()
## Column 'Producer Postcode' violates assertion 'pred_function' 32 times
## verb redux_fn predicate column index value
## 1 assert NA pred_function Producer Postcode 581 ae41 znd
## 2 assert NA pred_function Producer Postcode 582 ae41 znd
## 3 assert NA pred_function Producer Postcode 583 ae41 znd
## 4 assert NA pred_function Producer Postcode 736 ke2 tvx
## 5 assert NA pred_function Producer Postcode 738 ke2 tvx
## [omitted 27 rows]
## Warning: assertr encountered errors
Postcodes here are anonimised again
Other functionalities:
library(validate)
Verify your data in the exploration stage: produces a validation object with methods to interrogate what is wrong with the data.
https://cran.r-project.org/web/packages/validate/vignettes/indicators.html
df_broiler %>%
check_that(AscitesOrOedemaPercentage >= 0,
AscitesOrOedemaPercentage <= 100,
FPDSwedishScore >= 0,
FPDSwedishScore <= 200) %>%
barplot
Other functionalities:
library(pointblank)
Create elaborate data verification reports on one or more databases simultaneously.
Works very well with %>%
What if I want to verify the range of a variable conditional on another?
E.g. the maximum age of a bird varies by whether they are free range farmed or not.
agent <-
create_agent() %>%
focus_on(tbl_name = "df_broiler") %>%
col_vals_lte(column = `Poultry Age`,
value = 60,
preconditions = `Poultry Production System` == "Intensive Indoor") %>%
col_vals_lte(column = `Poultry Age`,
value = 150,
preconditions = `Poultry Production System` == "Free Range") %>%
interrogate(get_problem_rows = TRUE,
get_first_n = 10)
get_row_sample_info(agent)
## step tbl type n_fail n_sampled
## 1 1 df_broiler col_vals_lte 283 10
## 2 2 df_broiler col_vals_lte 25 10
## brief
## 1 Expect that when ``Poultry Production System` == 'Intensive Indoor'`, values in ``Poultry Age`` (computed column) should be > 60
## 2 Expect that when ``Poultry Production System` == 'Free Range'`, values in ``Poultry Age`` (computed column) should be > 150
Inspecting the wrong values for intensive farming
agent %>%
get_row_sample_data(step = 1) %>%
select(`Poultry Age`)
## # A tibble: 10 x 1
## `Poultry Age`
## <dbl>
## 1 43450
## 2 63
## 3 80
## 4 150
## 5 71
## 6 71
## 7 434
## 8 434
## 9 434
## 10 434
Inspecting the wrong values for free range farming
agent %>%
get_row_sample_data(step = 2) %>%
select(`Poultry Age`)
## # A tibble: 10 x 1
## `Poultry Age`
## <dbl>
## 1 1000
## 2 1000
## 3 556
## 4 6472122
## 5 6472122
## 6 5972
## 7 7084
## 8 787
## 9 8178
## 10 560
Other functionalities:
After data cleaning, create a variable codebook with:
Create a logbook tracing every decision (e.g. markdown, readme)
Use this codebook to implement data assertions in analysis pipeline.
R Studio Cheatsheets
Regular Expression tester (e.g. regex101.com)
Some Twitter accounts I learn from:
@dataandme – @WeAreRLadies – @_ColinFay – @romain_francois – @krlmlr – @jdatap – @xvrdm – @ma_salmon – @BecomingDataSci – @robinson_es – @drob – @carroll_jono – @thomasp85 – @hrbrmstr – @juliasilge – @JennyBryan – @opencpu – @Rbloggers