Rotoiti QC Summary

Load libraries

library(dplyr)
library(tidyr)
library(readr)
library(lubridate)
library(ggplot2)
library(scattermore)
library(DT)
library(patchwork)

# Use the "C" locale for date/time formatting.
withr::local_locale(c(LC_TIME = "C"))

# Fixed UTC+12 offset (NZST, no daylight saving). POSIX-style "Etc/GMT" zone
# names have an inverted sign, so "Etc/GMT-12" is 12 hours *ahead* of UTC.
withr::local_timezone("Etc/GMT-12")

# Project helper functions used further down in this document.
source("R/qc_funs.R")

1 Download data from GitHub

# Name of the release asset; it is saved to, and unzipped from, the working
# directory.
zip_file <- "rotoiti_data_qc.zip"

# Fetch the zipped QC dataset from the v0.0.1 GitHub release.
piggyback::pb_download(
  file = zip_file,
  dest = ".",
  repo = "limnotrack/f_rotoiti",
  tag = "v0.0.1"
)

# Fail fast with a clear message if the archive is not there, instead of
# erroring later when the CSV files are read.
if (!file.exists(zip_file)) {
  stop(
    "Download failed: '", zip_file, "' was not found in the working directory.",
    call. = FALSE
  )
}

# Unzip the file
unzip(zip_file)

2 Read data

# Directory holding the unzipped CSV files.
path <- "rotoiti_data_qc"

# Read one CSV file from `path`. `cols()` with no arguments leaves column type
# guessing to readr.
read_qc_csv <- function(file) {
  read_csv(file.path(path, file), col_types = cols())
}

# Metadata tables
site <- read_qc_csv("sites.csv")
site_events <- read_qc_csv("site_events.csv")
site_devices <- read_qc_csv("site_devices.csv")
device_var <- read_qc_csv("device_variable.csv")
device_position <- read_qc_csv("device_position.csv")
sensor_reference <- read_qc_csv("sensor_reference.csv")
sensor_calibrations <- read_qc_csv("sensor_calibrations.csv")
sensor_scaling <- read_qc_csv("sensor_scaling.csv")
variable_ref <- read_qc_csv("variables.csv")
qc_filters <- read_qc_csv("qc_filters.csv")

# Quality-controlled observations in wide format
data_wide <- read_qc_csv("rotoiti_qc.csv")

Pivot the data to long format and map site devices.

# Reshape to long format and attach device metadata in one pipeline.
# Columns named `qc_value_<id>`, `qc_code_<id>` and `qc_flag_<id>` collapse
# into `qc_value`, `qc_code` and `qc_flag`, with `<id>` kept in `var_ref_id`.
# map_data_to_devices() comes from R/qc_funs.R.
data <- data_wide |>
  pivot_longer(
    cols = matches("^(qc_value|qc_code|qc_flag)_"),
    names_pattern = "^(qc_value|qc_code|qc_flag)_(.+)$",
    names_to = c(".value", "var_ref_id")
  ) |>
  map_data_to_devices(
    site_devices = site_devices,
    device_var = device_var,
    device_position = device_position,
    variables = variable_ref
  )

# Preview the first rows.
# NOTE(review): the printed preview shows each timestamp twice for the same
# `var_ref_id`; check whether the device mapping duplicates rows.
head(data)

# A tibble: 6 × 9
  site  device_id var_abbr var_ref_id label datetime            qc_value qc_code
  <chr> <chr>     <chr>    <chr>      <chr> <dttm>                 <dbl> <chr>  
1 f_Ro… AA_devic… c_do     c_do_d100  Diss… 2007-01-12 05:00:00       NA QC 200 
2 f_Ro… AA_devic… c_do     c_do_d100  Diss… 2007-01-12 05:00:00       NA QC 200 
3 f_Ro… AA_devic… c_do     c_do_d100  Diss… 2007-01-12 05:15:00       NA QC 200 
4 f_Ro… AA_devic… c_do     c_do_d100  Diss… 2007-01-12 05:15:00       NA QC 200 
5 f_Ro… AA_devic… c_do     c_do_d100  Diss… 2007-01-12 05:30:00       NA QC 200 
6 f_Ro… AA_devic… c_do     c_do_d100  Diss… 2007-01-12 05:30:00       NA QC 200 
# ℹ 1 more variable: qc_flag <chr>

3 Summarise by variable

Which variables are in the dataset?

The table below lists the variables in the dataset. The var_ref_id is the variable reference ID, which identifies the variable in the dataset. The var_abbr is the variable abbreviation and the label is the variable label; both identify the variable in the metadata. The reference is the reference type for the variable: it can be “d” (depth), “h” (height) or “e” (elevation). The z_relative is the corresponding value in meters.

# One row per variable reference ID. decode_var_ref() (from R/qc_funs.R) adds
# the decoded columns, and `variable_ref` supplies the label for each one.
data |>
  distinct(var_ref_id) |>
  mutate(decode_var_ref(var_ref_id)) |>
  left_join(variable_ref, by = c("var_abbr" = "abbr")) |>
  select(label, var_abbr, var_ref_id, reference, z_relative) |>
  datatable(
    rownames = FALSE,
    options = list(
      pageLength = 6,
      # Centre every column
      columnDefs = list(list(className = "dt-center", targets = "_all"))
    )
  )

Table 1

# Summary statistics of the quality-controlled values for each variable label.
data |>
  group_by(label) |>
  summarise(
    # min()/max() return Inf/-Inf (with a warning) when every value in the
    # group is NA, so report NA for such groups instead.
    min = if (all(is.na(qc_value))) NA_real_ else min(qc_value, na.rm = TRUE),
    max = if (all(is.na(qc_value))) NA_real_ else max(qc_value, na.rm = TRUE),
    median = median(qc_value, na.rm = TRUE),
    mean = mean(qc_value, na.rm = TRUE),
    sd = sd(qc_value, na.rm = TRUE),
    # Number of rows, including missing values
    n = n(),
    # Percentage of rows with a missing value
    na_pct = 100 * sum(is.na(qc_value)) / n(),
    # Percentage of rows with a quality code of QC 300, 400, 500 or 600
    qc_pct = 100 * sum(qc_code %in% c("QC 300", "QC 400",
                                      "QC 500", "QC 600")) / n()
  ) |>
  # Round all numeric columns to one decimal place
  mutate(across(where(is.numeric), \(x) round(x, 1))) |>
  DT::datatable()

4 Quality control

We used a set of quality control codes to assess the quality of the data. The codes are from the National Environmental Monitoring Standards (NEMS). The codes are as follows:

Table 2: Quality control codes used in the dataset.

5 Visualise each variable

The plots below show A) the time series of the data, B) the quality codes (QC) attributed to each data point, and C) the device used to measure the data.

5.1 Temperature

# Plot the water temperature (`t_wtr_*`) series listed below.
plot_var_ts_qc(
  data = data,
  var_ref_id = c(
    "t_wtr_d50", "t_wtr_d200", "t_wtr_d400", "t_wtr_d600", "t_wtr_d900",
    "t_wtr_d1200", "t_wtr_d1500", "t_wtr_d1800", "t_wtr_d1810", "t_wtr_d2100"
  )
)

Figure 1: Quality control of temperature data at various depths.

5.2 Oxygen saturation

# Plot the two oxygen saturation (`c_do_sat_*`) series.
plot_var_ts_qc(
  data = data,
  var_ref_id = c("c_do_sat_d100", "c_do_sat_d1800")
)

Figure 2: Quality control of oxygen saturation data at depths of 1 m and 18 m.

5.3 Oxygen concentration

# Plot the two oxygen concentration (`c_do_*`) series.
plot_var_ts_qc(
  data = data,
  var_ref_id = c("c_do_d100", "c_do_d1800")
)

Figure 3: Quality control of oxygen concentration data at depths of 1 m and 18 m.

5.4 Chlorophyll

# Plot the single chlorophyll series.
plot_var_ts_qc(
  data = data,
  var_ref_id = "f_chl_d100"
)

Figure 4: Quality control of chlorophyll data at a depth of 1 m.

6 Download data

The data used in this analysis are available for download as CSV files bundled in the rotoiti_data_qc.zip archive. The archive can be downloaded either from GitHub or with the button below.

Download Data