Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
Package: datadiff
Title: Data Validation Based on YAML Rules
Version: 0.4.6
Title: Data Validation Based on 'YAML' Rules
Version: 0.4.7
Authors@R: c(
person("Vincent", "Guyader", , "vincent@thinkr.fr", role = c("cre", "aut"),
comment = c(ORCID = "0000-0003-0671-9270")),
person("ThinkR", role = "cph"),
person("Agence technique de l'information sur l'hospitalisation", role = "spn") )
Description: A comprehensive data validation package that allows comparing
datasets using configurable validation rules defined in YAML files.
datasets using configurable validation rules defined in 'YAML' files.
Built on top of the 'pointblank' package for robust data validation, it
supports exact matching, tolerance-based numeric comparisons, text
normalization, and row count validation.
Expand Down
25 changes: 25 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,28 @@
# datadiff 0.4.7

## Bug fixes

* The lazy HTML report now keeps the genuine failure detail. Printing
`result$reponse` (or calling `datadiff_report_html()`) on a failing
comparison again shows, for each failing column, the number of failing rows,
the offending cells, and the downloadable CSV extract - while still listing
every check performed, with the `col_exists` existence checks and the
`col_vals_equal` value checks presented as distinct steps. The report is now
built on top of the real interrogated agent (which holds the extracts) rather
than a counts-only synthesis, so nothing is lost.

* `build_report_agent()` threads `warn_at` / `stop_at` so the report's
warn/stop colouring follows the supplied thresholds instead of hard-coded
values.

## Documentation

* Vignette and README document `coverage` / `summary` and the lazy report, and
are normalised to ASCII.
* DESCRIPTION quotes software names ('pointblank', 'YAML') per CRAN convention,
clearing the "Possibly misspelled words" NOTE.
* Internal helpers are marked `@noRd` (documentation kept in source, no `.Rd`).

# datadiff 0.4.6

## New features
Expand Down
6 changes: 6 additions & 0 deletions R/coverage.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ build_coverage <- function(tbl, tol_cols, eq_cols,
n_failed <<- c(n_failed, as.integer(nf))
}

# Existence checks: every common (non-type-mismatch) column gets a col_exists
# check, distinct from its value check. These pass (the column is present in
# both datasets by construction), mirroring a full per-column pointblank run.
for (c in c(tol_cols, eq_cols)) {
add(c, "col_exists", 1L, 0L)
}
for (c in tol_cols) {
cnt <- tol_col_counts(tbl, col = c)
add(c, "tolerance", cnt$n, cnt$n_failed)
Expand Down
122 changes: 117 additions & 5 deletions R/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,119 @@
# per-column interrogation. The build cost is paid only when the report is
# actually displayed, and memoized so repeated prints are instant.

# Build a pointblank agent whose report mirrors the coverage table, populating
# the interrogation result columns directly (no interrogate() scan).
# Map the column name of a validation step back to the coverage column it
# stands for, undoing the internal naming used when the agent was built:
#   "row_count_ok"              -> "<row_count>"
#   "__missing_col_<name>"      -> "<name>"
#   "__type_mismatch_<name>"    -> "<name>"
#   "<name>__ok" / "<name>__eq" -> "<name>"
# Any other name is returned unchanged.
#
# `col` is expected to be a single string. An empty or NA input (for instance
# the result of `x[1]` on an empty column entry) yields NA_character_ instead
# of aborting in `if ()` with "missing value where TRUE/FALSE needed"; an NA
# result never matches a coverage column in `match()`.
report_underlying_col <- function(col) {
  # Guard first: startsWith() returns NA for NA input, which `if` cannot test.
  if (length(col) == 0L || is.na(col[1])) {
    return(NA_character_)
  }
  if (identical(col, "row_count_ok")) {
    return("<row_count>")
  }
  if (startsWith(col, "__missing_col_")) {
    return(sub("^__missing_col_", "", col))
  }
  if (startsWith(col, "__type_mismatch_")) {
    return(sub("^__type_mismatch_", "", col))
  }
  # Value checks: strip the trailing tolerance (__ok) / equality (__eq) marker.
  sub("__(ok|eq)$", "", col)
}

# Build a pointblank agent whose report mirrors the coverage table.
#
# When `real_agent` (the genuinely interrogated agent from the comparison) is
# supplied, the report is built ON TOP of it so that failing columns keep their
# REAL data extract (failing rows + CSV download) and the agent stays marked as
# interrogated. Its validation set is rebuilt to list every coverage row in
# order: the real step is kept (and relabelled) for columns it validated, and a
# synthetic passing row is added for the rest. Without `real_agent`, or when
# the supplied agent recorded no failing step, a purely synthetic count-only
# agent is produced (no extracts).
build_report_agent <- function(coverage, label, lang = "fr", locale = "fr_FR",
warn_at = 1e-14, stop_at = 1e-14) {
warn_at = 1e-14, stop_at = 1e-14,
real_agent = NULL) {
n <- nrow(coverage)

# Augment the real interrogated agent only when it has genuine failures
# (real extracts to preserve). On an all-pass comparison the real agent is the
# minimal placeholder (a single col_exists step); cloning its template would
# mislabel every value check as col_exists, so fall through to the synthetic
# col_vals_equal build instead.
if (!is.null(real_agent) &&
any(real_agent$validation_set$n_failed > 0, na.rm = TRUE)) {
rvs <- real_agent$validation_set
real_cols <- vapply(seq_len(nrow(rvs)), function(j) {
report_underlying_col(rvs$column[[j]][1])
}, character(1))

# Extracts (the failing-row data + CSV) live in `agent$extracts`, a list
# keyed by step index `i`. Reindexing the steps means remapping those keys.
old_extracts <- real_agent$extracts %||% list()
template <- rvs[1, , drop = FALSE]
rows <- vector("list", n)
new_extracts <- list()
for (i in seq_len(n)) {
col <- coverage$column[i]
# Only value checks (tolerance / equality) map to a real step: that is
# where the genuine row-level extract matters. col_exists and the
# structural checks (missing_column, type_mismatch, row_count) are
# synthesized from coverage - mapping them to a real step would pull in
# per-row dummy results (e.g. n_failed = nrow) that contradict coverage.
j <- if (coverage$check[i] %in% c("tolerance", "equality")) {
match(col, real_cols)
} else {
NA_integer_
}
if (!is.na(j)) {
# Genuine interrogated step: keep it, and carry its real extract over to
# the new step index.
row <- rvs[j, , drop = FALSE]
old_key <- as.character(rvs$i[j])
if (!is.null(old_extracts[[old_key]])) {
new_extracts[[as.character(i)]] <- old_extracts[[old_key]]
}
} else {
# Column the targeted agent did not validate (it passed): synthesise a
# passing row from the real-row template.
row <- template
row$eval_error <- FALSE
row$eval_warning <- FALSE
row$n <- as.numeric(coverage$n[i])
row$n_passed <- as.numeric(coverage$n[i] - coverage$n_failed[i])
row$n_failed <- as.numeric(coverage$n_failed[i])
row$f_passed <- if (coverage$n[i] > 0) row$n_passed / coverage$n[i] else 1
row$f_failed <- if (coverage$n[i] > 0) coverage$n_failed[i] / coverage$n[i] else 0
row$all_passed <- coverage$n_failed[i] == 0L
row$warn <- FALSE
row$stop <- FALSE
row$notify <- FALSE
}
row$column <- list(coverage$column[i])
row$label <- coverage$check[i]
rows[[i]] <- row
}
new_vs <- dplyr::bind_rows(rows)
new_vs$i <- seq_len(nrow(new_vs))
new_vs$assertion_type <- ifelse(coverage$check == "col_exists",
"col_exists", "col_vals_equal")
# Make every result column authoritative from coverage so the report is
# consistent with res$coverage / res$summary and warn_at/stop_at apply
# uniformly. The genuine row-level extracts are preserved separately in
# new_extracts (keyed by the new step index).
np <- coverage$n - coverage$n_failed
new_vs$n <- as.numeric(coverage$n)
new_vs$n_passed <- as.numeric(np)
new_vs$n_failed <- as.numeric(coverage$n_failed)
new_vs$f_passed <- ifelse(coverage$n > 0, np / coverage$n, 1)
new_vs$f_failed <- ifelse(coverage$n > 0, coverage$n_failed / coverage$n, 0)
new_vs$all_passed <- coverage$n_failed == 0L
new_vs$warn <- coverage$n_failed > 0L & new_vs$f_failed >= warn_at
new_vs$stop <- coverage$n_failed > 0L & new_vs$f_failed >= stop_at
new_vs$notify <- rep(FALSE, nrow(new_vs))
new_vs$eval_error <- rep(FALSE, nrow(new_vs))
new_vs$eval_warning <- rep(FALSE, nrow(new_vs))
real_agent$validation_set <- new_vs
real_agent$extracts <- new_extracts
return(real_agent)
Comment on lines +98 to +120
}

# No real agent: synthetic count-only report.
dummy_ncol <- max(n, 1L)
dummy <- as.data.frame(
matrix(TRUE, nrow = 1L, ncol = dummy_ncol)
Expand Down Expand Up @@ -46,6 +154,8 @@ build_report_agent <- function(coverage, label, lang = "fr", locale = "fr_FR",
vs$warn <- coverage$n_failed > 0L & vs$f_failed >= warn_at
vs$stop <- coverage$n_failed > 0L & vs$f_failed >= stop_at
vs$notify <- rep(FALSE, n)
vs$assertion_type <- ifelse(coverage$check == "col_exists",
"col_exists", "col_vals_equal")
agent$validation_set <- vs
agent
}
Expand Down Expand Up @@ -80,7 +190,8 @@ datadiff_render_report <- function(x) {
lang = attr(x, "datadiff_lang") %||% "fr",
locale = attr(x, "datadiff_locale") %||% "fr_FR",
warn_at = attr(x, "datadiff_warn_at") %||% 1e-14,
stop_at = attr(x, "datadiff_stop_at") %||% 1e-14
stop_at = attr(x, "datadiff_stop_at") %||% 1e-14,
real_agent = x
)
)
}
Expand Down Expand Up @@ -130,7 +241,8 @@ datadiff_report_html <- function(res, file = NULL) {
lang = attr(reponse, "datadiff_lang") %||% "fr",
locale = attr(reponse, "datadiff_locale") %||% "fr_FR",
warn_at = attr(reponse, "datadiff_warn_at") %||% 1e-14,
stop_at = attr(reponse, "datadiff_stop_at") %||% 1e-14
stop_at = attr(reponse, "datadiff_stop_at") %||% 1e-14,
real_agent = reponse
)
report <- pointblank::get_agent_report(agent)
if (!is.null(file)) {
Expand Down
64 changes: 40 additions & 24 deletions tests/testthat/test-coverage.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@ test_that("tolerance columns appear once each with correct counts", {
row_validation_info = list(check_count = FALSE), row_count_ok = TRUE,
ref_suffix = "__reference", na_equal = TRUE
)
expect_equal(cov$column, c("a", "b"))
expect_equal(cov$check, c("tolerance", "tolerance"))
expect_equal(cov$n, c(3L, 3L))
expect_equal(cov$n_failed, c(0L, 1L))
expect_equal(cov$status, c("PASS", "FAIL"))
tol <- cov[cov$check == "tolerance", ]
expect_equal(tol$column, c("a", "b"))
expect_equal(tol$n, c(3L, 3L))
expect_equal(tol$n_failed, c(0L, 1L))
expect_equal(tol$status, c("PASS", "FAIL"))
# each column also gets a (passing) col_exists check, distinct from the value check
expect_setequal(cov$column[cov$check == "col_exists"], c("a", "b"))
expect_true(all(cov$status[cov$check == "col_exists"] == "PASS"))
})

# --- build_coverage: equality, lazy (__eq) and local (recompute) ------------
Expand All @@ -34,10 +37,11 @@ test_that("equality columns use pre-computed __eq when present (lazy path)", {
row_validation_info = list(check_count = FALSE), row_count_ok = TRUE,
ref_suffix = "__reference", na_equal = TRUE
)
expect_equal(cov$check, "equality")
expect_equal(cov$n, 2L)
expect_equal(cov$n_failed, 1L)
expect_equal(cov$status, "FAIL")
eq <- cov[cov$check == "equality", ]
expect_equal(eq$n, 2L)
expect_equal(eq$n_failed, 1L)
expect_equal(eq$status, "FAIL")
expect_equal(cov$column[cov$check == "col_exists"], "txt")
})

test_that("equality columns recompute from raw values when no __eq (local path)", {
Expand All @@ -52,9 +56,10 @@ test_that("equality columns recompute from raw values when no __eq (local path)"
row_validation_info = list(check_count = FALSE), row_count_ok = TRUE,
ref_suffix = "__reference", na_equal = TRUE
)
expect_equal(cov$n, 3L)
expect_equal(cov$n_failed, 1L)
expect_equal(cov$status, "FAIL")
eq <- cov[cov$check == "equality", ]
expect_equal(eq$n, 3L)
expect_equal(eq$n_failed, 1L)
expect_equal(eq$status, "FAIL")
})

test_that("equality NA semantics follow na_equal (local path)", {
Expand All @@ -67,8 +72,10 @@ test_that("equality NA semantics follow na_equal (local path)", {
list(check_count = FALSE), TRUE, "__reference", TRUE)
cov_f <- build_coverage(tbl, character(0), "txt", character(0), character(0),
list(check_count = FALSE), TRUE, "__reference", FALSE)
expect_equal(cov_t$n_failed, 0L) # NA both sides + na_equal TRUE -> pass
expect_equal(cov_f$n_failed, 1L) # na_equal FALSE -> the NA row fails
eq_t <- cov_t[cov_t$check == "equality", ]
eq_f <- cov_f[cov_f$check == "equality", ]
expect_equal(eq_t$n_failed, 0L) # NA both sides + na_equal TRUE -> pass
expect_equal(eq_f$n_failed, 1L) # na_equal FALSE -> the NA row fails
})

# --- build_coverage: structural checks --------------------------------------
Expand Down Expand Up @@ -115,7 +122,7 @@ test_that("row_count appears only when check_count is enabled, with right status

# --- exhaustivity & degenerate -----------------------------------------------

test_that("every column appears exactly once across check types", {
test_that("every column is covered: common columns get col_exists + a value check", {
tbl <- data.frame(a__ok = c(TRUE, TRUE), b__ok = c(TRUE, FALSE),
t__eq = c(TRUE, TRUE))
cov <- build_coverage(
Expand All @@ -124,9 +131,13 @@ test_that("every column appears exactly once across check types", {
row_validation_info = list(check_count = TRUE), row_count_ok = TRUE,
ref_suffix = "__reference", na_equal = TRUE
)
value_and_struct <- cov[cov$check != "row_count", "column"]
expect_setequal(value_and_struct, c("a", "b", "t", "m", "z"))
expect_equal(anyDuplicated(value_and_struct), 0L)
# common columns each appear as a col_exists AND a value (tolerance/equality) check
expect_setequal(cov$column[cov$check == "col_exists"], c("a", "b", "t"))
value <- cov[cov$check %in% c("tolerance", "equality"), "column"]
expect_setequal(value, c("a", "b", "t"))
# structural checks appear once each
expect_equal(cov$column[cov$check == "missing_column"], "m")
expect_equal(cov$column[cov$check == "type_mismatch"], "z")
})

test_that("empty inputs yield a 0-row coverage with the right columns", {
Expand All @@ -150,9 +161,10 @@ test_that("summarize_coverage aggregates checks and pass/fail counts", {
row_validation_info = list(check_count = FALSE), row_count_ok = TRUE,
ref_suffix = "__reference", na_equal = TRUE
)
# col_exists(a) + col_exists(b) + tolerance(a FAIL) + tolerance(b PASS)
s <- summarize_coverage(cov)
expect_equal(s$n_checks, 2L)
expect_equal(s$n_pass, 1L)
expect_equal(s$n_checks, 4L)
expect_equal(s$n_pass, 3L)
expect_equal(s$n_fail, 1L)
expect_false(s$all_passed)
})
Expand All @@ -170,9 +182,10 @@ cov_from <- function(tbl, tol_cols, eq_cols, missing = character(0),
test_that("print shows an all-PASS roll-up when everything passes", {
cov <- cov_from(data.frame(a__ok = c(TRUE, TRUE), b__ok = c(TRUE, TRUE)),
c("a", "b"), character(0))
# 2 col_exists + 2 tolerance, all passing
out <- paste(capture.output(print(cov)), collapse = "\n")
expect_match(out, "2 checks")
expect_match(out, "2 PASS")
expect_match(out, "4 checks")
expect_match(out, "4 PASS")
expect_match(out, "0 FAIL")
})

Expand All @@ -188,7 +201,8 @@ test_that("print lists failing checks first when there are failures", {
test_that("print handles an all-fail table", {
cov <- cov_from(data.frame(a__ok = c(FALSE, FALSE)), "a", character(0))
out <- paste(capture.output(print(cov)), collapse = "\n")
expect_match(out, "1 checks - 0 PASS, 1 FAIL")
# col_exists(a) passes, tolerance(a) fails
expect_match(out, "2 checks - 1 PASS, 1 FAIL")
})

test_that("print handles an empty coverage table without error", {
Expand Down Expand Up @@ -233,5 +247,7 @@ test_that("coverage summary verdict matches all_passed on a failing comparison",
)
expect_false(res$all_passed)
expect_equal(res$summary$all_passed, res$all_passed)
expect_equal(res$coverage[res$coverage$column == "a", "status"], "FAIL")
cov <- res$coverage
expect_equal(cov[cov$column == "a" & cov$check == "tolerance", "status"], "FAIL")
expect_equal(cov[cov$column == "a" & cov$check == "col_exists", "status"], "PASS")
})
Loading
Loading