Data Validation in R

Bobby Stuijfzand

14 May 2019 - R Lunch

Introduction

Aims of talk

  • Cleaning adminstrative data to be used in research
  • Demonstrate data validation packages

Clean data: a working definition

The data are what they say they are

Case: Broilers

Context

# import cleaned data
var_types <- "ccccDiicccidddddididididididididididididididididididddddc"

df_broiler <- read_csv(data_path, col_types = var_types)

# how many batches?
nrow(df_broiler) %>% format(big.mark = ",")
## [1] "571,276"
# find out how many animals?
sum(df_broiler$n_animals, na.rm = TRUE) %>% format(big.mark = ",")
## [1] "3,994,373,939"

Importing data

Importing data

Twitter

Different number of columns

# get filenames and load the datafiles into a list
file_names <- list.files(data_path, pattern = "^Broiler.*\\.csv$") 

df_broiler <- map(file.path(data_path, file_names), read_csv)
names(df_broiler) <- file_names

# print number of columns for each data file
map_dbl(df_broiler, ncol)
##    Broiler 2010.csv    Broiler 2011.csv    Broiler 2012.csv 
##                  57                  57                  57 
## Broiler 2013-14.csv 
##                  56

Different number of columns (2)

# get all unique variable names across all dataframes
var_names <- df_broiler %>% map(names) %>% unlist %>% unique

# for each dataframe, check if each variable actually exists, and if not, print
# it out.
map(df_broiler,
    function (x){
      ix <- 
        var_names %>% 
        map_lgl(function (y) y %in% names(x) == FALSE) %>% 
        which

      var_names[ix]
    })
## $`Broiler 2010.csv`
## character(0)
## 
## $`Broiler 2011.csv`
## character(0)
## 
## $`Broiler 2012.csv`
## character(0)
## 
## $`Broiler 2013-14.csv`
## [1] "AnimalHealthReport"

Different classes

var_class <- 
  map_df(df_broiler, 
         function (x) {
           map_chr(x, class)
         })

ix <- apply(var_class, 1, function (x) length(unique(x)) > 1)

bind_cols(c(names(df_broiler[[1]])[which(ix)], var_class[which(ix), ]))
## # A tibble: 1 x 5
##   V1    `Broiler 2010.c… `Broiler 2011.c… `Broiler 2012.c… `Broiler 2013-1…
##   <chr> <chr>            <chr>            <chr>            <chr>           
## 1 Corr… character        numeric          numeric          numeric
str(df_broiler$`Broiler 2010.csv`$Correction)
##  chr [1:51028] "NULL" "NULL" "NULL" "NULL" "NULL" "NULL" "NULL" "NULL" ...
df_broiler$`Broiler 2010.csv`$Correction <-
  as.integer(df_broiler$`Broiler 2010.csv`$Correction)

Combine data

# combine list with dataframes into a single dataframe
df_broiler <- bind_rows(df_broiler)

Data Cleaning: Complete Data?

Distribution of data over time (1)

ggplot(df_broiler, mapping = aes(x = StartDate)) +
  geom_histogram(bins = 100) +
  scale_x_date(name = "Start Date") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Distribution of data over time (2)

summary(df_broiler$StartDate)
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "1980-01-01" "2011-08-18" "2012-09-05" "2012-08-12" "2013-08-23" 
##         Max.         NA's 
## "2043-08-22"       "1260"
df_broiler %>% 
  filter(StartDate < "2010-01-01" | 
           StartDate > "2014-06-08") %>% 
  select(contains("Date")) %>% 
  head()
## # A tibble: 6 x 2
##   StartDate  CompletionDate
##   <date>     <date>        
## 1 2008-10-28 2011-03-01    
## 2 2008-09-01 2011-01-10    
## 3 1980-01-01 2014-01-08    
## 4 1980-01-01 2014-01-08    
## 5 1980-01-07 2014-01-08    
## 6 2025-04-25 2014-04-25

Distribution of data over time (3)

# get an index vector for the rows with invalid dates
ix <- ((df_broiler$StartDate < "2010-01-01") |
      (df_broiler$StartDate > "2014-06-08")) &
      !is.na(df_broiler$StartDate)

# replace start date with completion date
df_broiler$StartDate[ix] <- df_broiler$CompletionDate[ix]

# plot new distribution
ggplot(df_broiler) + 
  geom_bar(mapping = aes(x = format(StartDate, "%Y-%m"), ..count..)) +
  scale_x_discrete(name = "Start Date") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Distribution of data over time (4)

duplicate_cases <- duplicated(df_broiler)
sum(duplicate_cases)
## [1] 1258
df_broiler <- df_broiler[!duplicate_cases, ]

ggplot(df_broiler) + 
  geom_bar(mapping = aes(x = format(StartDate, "%Y-%m"), ..count..)) +
  scale_x_discrete(name = "Start Date") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Duplicate spotting

df_subset <-
  df_broiler %>%
  select(
    # identifiers
    BatchHouseID,
    BatchID,
    # risk factors
    ProducerPostcode,
    StartDate,
    NAnimals,
    PoultryProductionSystem,
    PoultryBroilerStockingDensity,
    PoultryBroilerBreed,
    AgeOfBirds,
    CumulativeHouseMortalityPercentage,
    FPD0:FPD2,
    # outcomes
    AscitesOrOedema:TotalRejectionsTriggerIndicator
  )

duplicate_cases <- duplicated(df_subset)
sum(duplicate_cases)
## [1] 35730

Duplicate spotting (2)

duplicate_ids <- df_broiler$BatchHouseID[duplicate_cases] %>% unique

df_random_check <- filter(df_broiler, BatchHouseID == sample(duplicate_ids, 1))

ix <- apply(df_random_check, 2, function (x) length(unique(x)) > 1)
df_random_check[, ix]
## # A tibble: 2 x 2
##   MHSOfficePhone EstablishmentPhone
##   <chr>          <chr>             
## 1 15485638       27875412          
## 2 015485638      027875412
## # A tibble: 2 x 2
##   HouseNo MHSOfficePhone
##   <chr>   <chr>         
## 1 41732   92986522      
## 2 3/4     092986522

Note that these are fake phonenumbers for the purposes of illustration, hence they don’t fit standard phone number format.

Duplicate spotting (3)

df_broiler <- df_broiler[!duplicate_cases, ]

ggplot(df_broiler) + 
  geom_bar(mapping = aes(x = format(StartDate, "%Y-%m"), ..count..)) +
  scale_x_discrete(name = "Start Date") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Duplicate spotting (4)

df_subset <- filter(df_broiler,
                    StartDate > "2013-02-28"
                    & StartDate < "2013-04-01")

duplicate_cases <- 
  select(df_subset, BatchHouseID, BatchID, NAnimals) %>% 
  duplicated()

duplicate_ids <- df_subset$BatchHouseID[duplicate_cases] %>% unique
df_random_check <- filter(df_subset, BatchHouseID == sample(duplicate_ids, 1))
ix <- apply(df_random_check, 2, function (x) length(unique(x)) > 1)
df_random_check[, ix]
## # A tibble: 2 x 6
##   ProducerAddress2 HouseNo FPD0  FPD1  FPD2  FPDSwedishScore
##   <chr>            <chr>   <chr> <chr> <chr> <chr>          
## 1 NULL             01-Feb  NULL  NULL  NULL  NULL           
## 2 <NA>             1-2     <NA>  <NA>  <NA>  <NA>
## # A tibble: 2 x 4
##   AscitesOrOedemaPe… DermatitisOrCelluli… DOAPercentage JointLesionsOrArth…
##                <dbl>                <dbl>         <dbl>               <dbl>
## 1              0.201               0.0564         0.152             0.00705
## 2              0                   0              0                 0

Final duplicates check

String Data Quality Checks

Farm ID

anonimise(df_broiler$CPH[200])
## [1] "74/938/2350"
# check which ID's comply with format, allowing for some variation
check_CPH <- str_detect(df_broiler$CPH, "\\d{2}\\D\\d{3}\\D\\d{4}")

# extract those that dont' comply
weird_CPH <- unique(df_broiler$CPH[!check_CPH])

# print off
anonimise(weird_CPH)
##  [1] "03/857/753"           "48/3025"              "48/850/303"          
##  [4] "822/735/12"           "97/542/232"           "09/658/"             
##  [7] "502/638/138"          "671/406/29"           "65/122/9???"         
## [10] "17/86/3695"           "uhpghkfw@thlxeis.oby" "l/k"                 
## [13] "toz itehb"            "szwb"                 "ikwnxst"             
## [16] "05/021/106"           "87/895/400"           "71/65/1565"          
## [19] "80/862/70"            "26/011/137"           "238176741"           
## [22] "40/39/878"

Anonimise is a custom function that I wrote to randomly substitute letters for other letters and digits for other digits, for the purposes of this presentation.

Check Postcodes

BS6 6AA

BS34 5LE

EC1V 9BD

pc_correct <- 
  str_detect(df_broiler$ProducerPostcode, "^[:alpha:].*\\d[:alpha:]{2}$")
anonimise(unique(df_broiler$ProducerPostcode[!pc_correct]))
##  [1] NA         "sv4t y"   "me62 hi"  "ui57 kt"  "so"       "ox56 uz" 
##  [7] "lq82 zx"  "il87 bn"  "pp3"      "cm98 iyx" "pq44 cb"  "yglg"    
## [13] "al9 mgk"  "qydt rn"  "wq69 zf"  "h670 yv"  "qc"       "mp96 se" 
## [19] "hd90 xe"  "ps41 rk"  "qx62 rz"  "he67 kh"  "nr66 hm"  "sm05 pv" 
## [25] "lz05 le"  "ew45 yi"  "bd52 cuk" "pc53 7o4" "xp59 tf"  "nbtu xp" 
## [31] "qgdz ax"  "fbek is"  "zg58 bb"  "zt92 ou"  "xy69 wt"  "cw97 va" 
## [37] "ca50 lm"  "gg02 zz"  "wb12 is"  "er86 wx"  "6636 5"   "ec08 dd" 
## [43] "td02 jc"  "va73 to"  "kc39 lr"  "ee41 40y" "uc34 lg"

Anonimise is a custom function that I wrote to randomly substitute letters for other letters and digits for other digits, for the purposes of this presentation.

Numeric Data Quality Checks

Age of birds (1)

hist(df_broiler$`Poultry Age`, breaks = 1000)

Age of birds (2)

hist(df_broiler$`Poultry Age`[df_broiler$`Poultry Age` < 200], breaks = 1000)

summary(df_broiler$`Poultry Age`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0      33      37     105      40 6472122   24778

Age of birds (3)

df_check <- 
  df_broiler %>% 
  filter(`Poultry Age` == `Number Animals`) %>% 
  select(`Poultry Age`, `Number Animals`)
df_check
## # A tibble: 6 x 2
##   `Poultry Age` `Number Animals`
##           <dbl>            <dbl>
## 1            70               70
## 2          7128             7128
## 3          7128             7128
## 4          7128             7128
## 5            84               84
## 6           100              100

Age of birds (4)

hist(df_broiler$`Poultry Age`, breaks = 1000)

Age of birds (5)

# colour palette
c_palette <- c("#999999", 
               "#E69F00", 
               "#56B4E9", 
               "#009E73", 
               "#F0E442", 
               "#0072B2", 
               "#D55E00", 
               "#CC79A7")

# plot
ggplot(df_broiler) +
  geom_bar(mapping = aes(x = `Poultry Age`,
                         y = ..count..,
                         fill = `Poultry Production System`),
           alpha = 0.5) +
  scale_fill_manual(values = c_palette) +
  theme_light()

Age of birds (6)

Data validation packages

assertr

library(assertr)

Verify your data in the analysis pipeline.

Works very well with %>%

https://cran.r-project.org/web/packages/assertr/vignettes/assertr.html

assertr (2)

Example: I want to ensure that only the correct breeds are recorded in the dataset.

breed <- c("Cobb",
           "Hubbard",
           "Hybro",
           "Ross",
           "Other")

df_broiler %>% 
  assert(in_set(breed), `Broiler Breed`) %>% 
  group_by(`Broiler Breed`) %>% 
  count()
## Column 'Broiler Breed' violates assertion 'in_set(breed)' 7944 times
##     verb redux_fn     predicate        column index                 value
## 1 assert       NA in_set(breed) Broiler Breed    14 Mixed or Not Provided
## 2 assert       NA in_set(breed) Broiler Breed    15 Mixed or Not Provided
## 3 assert       NA in_set(breed) Broiler Breed    17 Mixed or Not Provided
## 4 assert       NA in_set(breed) Broiler Breed    18 Mixed or Not Provided
## 5 assert       NA in_set(breed) Broiler Breed    19 Mixed or Not Provided
##   [omitted 7939 rows]
## Warning: assertr encountered errors

assertr (3)

Can use your own predicate functions.

E.g. we want the analysis to stop if postcode is false, because location is important for our predictions.

pred_function <- function (x) str_detect(x, ".*\\d[:alpha:]{2}$")

df_broiler %>% 
  assert(pred_function, `Producer Postcode`) %>% 
  group_by(`Producer Postcode`) %>% 
  count()
## Column 'Producer Postcode' violates assertion 'pred_function' 32 times
##     verb redux_fn     predicate            column index    value
## 1 assert       NA pred_function Producer Postcode   581 ae41 znd
## 2 assert       NA pred_function Producer Postcode   582 ae41 znd
## 3 assert       NA pred_function Producer Postcode   583 ae41 znd
## 4 assert       NA pred_function Producer Postcode   736  ke2 tvx
## 5 assert       NA pred_function Producer Postcode   738  ke2 tvx
##   [omitted 27 rows]
## Warning: assertr encountered errors

Postcodes here are anonimised again

assertr (4)

Other functionalities:

  • insist: evaluate column conditional on characteristics of column itself (e.g. observations within 2 SD’s from mean)
  • rowwise operations: to detect multivariate outliers
  • multiple assertions: just add them to the pipeline

validate

library(validate)

Verify your data in the exploration stage: produces a validation object with methods to interrogate what is wrong with the data.

https://cran.r-project.org/web/packages/validate/vignettes/indicators.html

validate (2)

df_broiler %>%
  check_that(AscitesOrOedemaPercentage >= 0,
             AscitesOrOedemaPercentage <= 100, 
             FPDSwedishScore >= 0,
             FPDSwedishScore <= 200) %>% 
  barplot

validate (3)

Other functionalities:

  • auxiliary package errorlocate to help detect errors and provide suggestions for replacements.
  • can validate combinations of variables (e.g. the ratio between two variables)

pointblank

library(pointblank)

Create elaborate data verification reports on one or more databases simultaneously.

Works very well with %>%

https://github.com/rich-iannone/pointblank

pointblank (2)

What if I want to verify the range of a variable conditional on another?

E.g. the maximum age of a bird varies by whether they are free range farmed or not.

agent <- 
  create_agent() %>% 
  focus_on(tbl_name = "df_broiler") %>% 
  col_vals_lte(column = `Poultry Age`,
               value = 60,
               preconditions = `Poultry Production System` == "Intensive Indoor") %>% 
  col_vals_lte(column = `Poultry Age`,
               value = 150,
               preconditions = `Poultry Production System` == "Free Range") %>% 
  interrogate(get_problem_rows = TRUE,
              get_first_n = 10)

get_row_sample_info(agent)
##   step        tbl         type n_fail n_sampled
## 1    1 df_broiler col_vals_lte    283        10
## 2    2 df_broiler col_vals_lte     25        10
##                                                                                                                              brief
## 1 Expect that when ``Poultry Production System` == 'Intensive Indoor'`, values in ``Poultry Age`` (computed column) should be > 60
## 2      Expect that when ``Poultry Production System` == 'Free Range'`, values in ``Poultry Age`` (computed column) should be > 150

pointblank (3)

Inspecting the wrong values for intensive farming

agent %>%
  get_row_sample_data(step = 1) %>% 
  select(`Poultry Age`)
## # A tibble: 10 x 1
##    `Poultry Age`
##            <dbl>
##  1         43450
##  2            63
##  3            80
##  4           150
##  5            71
##  6            71
##  7           434
##  8           434
##  9           434
## 10           434

pointblank (4)

Inspecting the wrong values for free range farming

agent %>%
  get_row_sample_data(step = 2) %>% 
  select(`Poultry Age`)
## # A tibble: 10 x 1
##    `Poultry Age`
##            <dbl>
##  1          1000
##  2          1000
##  3           556
##  4       6472122
##  5       6472122
##  6          5972
##  7          7084
##  8           787
##  9          8178
## 10           560

pointblank (5)

Other functionalities:

  • can do multiple dataframes in one pipeline and thus one report
  • good for collaborations: generate html / markdown report and email it (or push it to Slack)

Wrapping Up

Codebook

After data cleaning, create a variable codebook with:

  • Variable names
  • Variable labels
  • Original variable names (if applicable)
  • Data types
  • If categorical: categories
  • If numerical: range

Create a logbook tracing every decision (e.g. markdown, readme)

Use this codebook to implement data assertions in analysis pipeline.

Useful resources

R Studio Cheatsheets

  • Working with Strings
  • R Markdown Cheatsheet

Regular Expression tester (e.g. regex101.com)

Some Twitter accounts I learn from:

@dataandme – @WeAreRLadies – @_ColinFay – @romain_francois – @krlmlr – @jdatap – @xvrdm – @ma_salmon – @BecomingDataSci – @robinson_es – @drob – @carroll_jono – @thomasp85 – @hrbrmstr – @juliasilge – @JennyBryan – @opencpu – @Rbloggers

Thank you!