ThinkR-open · VincentGuyader · Jun 14, 2026 · Jun 14, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,13 +1,13 @@
 Package: datadiff
-Title: Data Validation Based on YAML Rules
-Version: 0.4.6
+Title: Data Validation Based on 'YAML' Rules
+Version: 0.4.7
 Authors@R: c(
     person("Vincent", "Guyader", , "vincent@thinkr.fr", role = c("cre", "aut"),
            comment = c(ORCID = "0000-0003-0671-9270")),
     person("ThinkR", role = "cph"),
     person("Agence technique de l'information sur l'hospitalisation", role = "spn") )
 Description: A comprehensive data validation package that allows comparing
-    datasets using configurable validation rules defined in YAML files.
+    datasets using configurable validation rules defined in 'YAML' files.
     Built on top of the 'pointblank' package for robust data validation, it
     supports exact matching, tolerance-based numeric comparisons, text
     normalization, and row count validation.

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,28 @@
+# datadiff 0.4.7
+
+## Bug fixes
+
+* The lazy HTML report now keeps the genuine failure detail. Printing
+  `result$reponse` (or calling `datadiff_report_html()`) on a failing
+  comparison again shows, for each failing column, the number of failing rows,
+  the offending cells, and the downloadable CSV extract - while still listing
+  every check performed, with the `col_exists` existence checks and the
+  `col_vals_equal` value checks presented as distinct steps. The report is now
+  built on top of the real interrogated agent (which holds the extracts) rather
+  than a counts-only synthesis, so nothing is lost.
+
+* `build_report_agent()` threads `warn_at` / `stop_at` so the report's
+  warn/stop colouring follows the supplied thresholds instead of hard-coded
+  values.
+
+## Documentation
+
+* Vignette and README document `coverage` / `summary` and the lazy report, and
+  are normalised to ASCII.
+* DESCRIPTION quotes software names ('pointblank', 'YAML') per CRAN convention,
+  clearing the "Possibly misspelled words" NOTE.
+* Internal helpers are marked `@noRd` (documentation kept in source, no `.Rd`).
+
 # datadiff 0.4.6
 
 ## New features

diff --git a/R/coverage.R b/R/coverage.R
@@ -47,6 +47,12 @@ build_coverage <- function(tbl, tol_cols, eq_cols,
     n_failed <<- c(n_failed, as.integer(nf))
   }
 
+  # Existence checks: every common (non-type-mismatch) column gets a col_exists
+  # check, distinct from its value check. These pass (the column is present in
+  # both datasets by construction), mirroring a full per-column pointblank run.
+  for (c in c(tol_cols, eq_cols)) {
+    add(c, "col_exists", 1L, 0L)
+  }
   for (c in tol_cols) {
     cnt <- tol_col_counts(tbl, col = c)
     add(c, "tolerance", cnt$n, cnt$n_failed)

diff --git a/R/report.R b/R/report.R
@@ -8,11 +8,119 @@
 # per-column interrogation. The build cost is paid only when the report is
 # actually displayed, and memoized so repeated prints are instant.
 
-# Build a pointblank agent whose report mirrors the coverage table, populating
-# the interrogation result columns directly (no interrogate() scan).
+# Map a validation-step column name back to its coverage column name by undoing
+# the internal naming used when the agent was built. A missing or non-character
+# name yields NA_character_ instead of failing on `if (NA)`.
+report_underlying_col <- function(col) {
+  if (!is.character(col) || length(col) != 1L || is.na(col)) {
+    return(NA_character_)
+  }
+  if (identical(col, "row_count_ok")) {
+    return("<row_count>")
+  }
+  # Structural dummies carry a prefix; value checks an __ok / __eq suffix.
+  prefix <- "^__(missing_col|type_mismatch)_"
+  if (grepl(prefix, col)) sub(prefix, "", col) else sub("__(ok|eq)$", "", col)
+}
+
+# Build a pointblank agent whose report mirrors the coverage table.
+#
+# When `real_agent` (the genuinely interrogated agent from the comparison) is
+# supplied, the report is built ON TOP of it so that failing columns keep their
+# REAL data extract (failing rows + CSV download) and the agent stays marked as
+# interrogated. Its validation set is rebuilt to list every coverage row in
+# order: the real step is kept (and relabelled) for columns it validated, and a
+# synthetic passing row is added for the rest. Without `real_agent`, a purely
+# synthetic count-only agent is produced (no extracts).
 build_report_agent <- function(coverage, label, lang = "fr", locale = "fr_FR",
-                                warn_at = 1e-14, stop_at = 1e-14) {
+                                warn_at = 1e-14, stop_at = 1e-14,
+                                real_agent = NULL) {
   n <- nrow(coverage)
+
+  # Augment the real interrogated agent only when it has genuine failures
+  # (real extracts to preserve). On an all-pass comparison the real agent is the
+  # minimal placeholder (a single col_exists step); cloning its template would
+  # mislabel every value check as col_exists, so fall through to the synthetic
+  # col_vals_equal build instead.
+  if (!is.null(real_agent) &&
+      any(real_agent$validation_set$n_failed > 0, na.rm = TRUE)) {
+    rvs <- real_agent$validation_set
+    real_cols <- vapply(seq_len(nrow(rvs)), function(j) {
+      report_underlying_col(rvs$column[[j]][1])
+    }, character(1))
+
+    # Extracts (the failing-row data + CSV) live in `agent$extracts`, a list
+    # keyed by step index `i`. Reindexing the steps means remapping those keys.
+    old_extracts <- real_agent$extracts %||% list()
+    template <- rvs[1, , drop = FALSE]
+    rows <- vector("list", n)
+    new_extracts <- list()
+    for (i in seq_len(n)) {
+      col <- coverage$column[i]
+      # Only value checks (tolerance / equality) map to a real step: that is
+      # where the genuine row-level extract matters. col_exists and the
+      # structural checks (missing_column, type_mismatch, row_count) are
+      # synthesized from coverage - mapping them to a real step would pull in
+      # per-row dummy results (e.g. n_failed = nrow) that contradict coverage.
+      j <- if (coverage$check[i] %in% c("tolerance", "equality")) {
+        match(col, real_cols)
+      } else {
+        NA_integer_
+      }
+      if (!is.na(j)) {
+        # Genuine interrogated step: keep it, and carry its real extract over to
+        # the new step index.
+        row <- rvs[j, , drop = FALSE]
+        old_key <- as.character(rvs$i[j])
+        if (!is.null(old_extracts[[old_key]])) {
+          new_extracts[[as.character(i)]] <- old_extracts[[old_key]]
+        }
+      } else {
+        # No real step for this row (col_exists / structural check, or a value
+        # check the targeted agent skipped): synthesise it from the template.
+        row <- template
+        row$eval_error   <- FALSE
+        row$eval_warning <- FALSE
+        row$n        <- as.numeric(coverage$n[i])
+        row$n_passed <- as.numeric(coverage$n[i] - coverage$n_failed[i])
+        row$n_failed <- as.numeric(coverage$n_failed[i])
+        row$f_passed <- if (coverage$n[i] > 0) row$n_passed / coverage$n[i] else 1
+        row$f_failed <- if (coverage$n[i] > 0) coverage$n_failed[i] / coverage$n[i] else 0
+        row$all_passed <- coverage$n_failed[i] == 0L
+        row$warn   <- FALSE
+        row$stop   <- FALSE
+        row$notify <- FALSE
+      }
+      row$column <- list(coverage$column[i])
+      row$label  <- coverage$check[i]
+      rows[[i]] <- row
+    }
+    new_vs <- dplyr::bind_rows(rows)
+    new_vs$i <- seq_len(nrow(new_vs))
+    new_vs$assertion_type <- ifelse(coverage$check == "col_exists",
+                                    "col_exists", "col_vals_equal")
+    # Make every result column authoritative from coverage so the report is
+    # consistent with res$coverage / res$summary and warn_at/stop_at apply
+    # uniformly. The genuine row-level extracts are preserved separately in
+    # new_extracts (keyed by the new step index).
+    np <- coverage$n - coverage$n_failed
+    new_vs$n            <- as.numeric(coverage$n)
+    new_vs$n_passed     <- as.numeric(np)
+    new_vs$n_failed     <- as.numeric(coverage$n_failed)
+    new_vs$f_passed     <- ifelse(coverage$n > 0, np / coverage$n, 1)
+    new_vs$f_failed     <- ifelse(coverage$n > 0, coverage$n_failed / coverage$n, 0)
+    new_vs$all_passed   <- coverage$n_failed == 0L
+    new_vs$warn         <- coverage$n_failed > 0L & new_vs$f_failed >= warn_at
+    new_vs$stop         <- coverage$n_failed > 0L & new_vs$f_failed >= stop_at
+    new_vs$notify       <- rep(FALSE, nrow(new_vs))
+    new_vs$eval_error   <- rep(FALSE, nrow(new_vs))
+    new_vs$eval_warning <- rep(FALSE, nrow(new_vs))
+    real_agent$validation_set <- new_vs
+    real_agent$extracts <- new_extracts
+    return(real_agent)
+  }
+
+  # No real agent: synthetic count-only report.
   dummy_ncol <- max(n, 1L)
   dummy <- as.data.frame(
     matrix(TRUE, nrow = 1L, ncol = dummy_ncol)
@@ -46,6 +154,8 @@ build_report_agent <- function(coverage, label, lang = "fr", locale = "fr_FR",
   vs$warn         <- coverage$n_failed > 0L & vs$f_failed >= warn_at
   vs$stop         <- coverage$n_failed > 0L & vs$f_failed >= stop_at
   vs$notify       <- rep(FALSE, n)
+  vs$assertion_type <- ifelse(coverage$check == "col_exists",
+                              "col_exists", "col_vals_equal")
   agent$validation_set <- vs
   agent
 }
@@ -80,7 +190,8 @@ datadiff_render_report <- function(x) {
         lang     = attr(x, "datadiff_lang") %||% "fr",
         locale   = attr(x, "datadiff_locale") %||% "fr_FR",
         warn_at  = attr(x, "datadiff_warn_at") %||% 1e-14,
-        stop_at  = attr(x, "datadiff_stop_at") %||% 1e-14
+        stop_at  = attr(x, "datadiff_stop_at") %||% 1e-14,
+        real_agent = x
       )
     )
   }
@@ -130,7 +241,8 @@ datadiff_report_html <- function(res, file = NULL) {
     lang     = attr(reponse, "datadiff_lang") %||% "fr",
     locale   = attr(reponse, "datadiff_locale") %||% "fr_FR",
     warn_at  = attr(reponse, "datadiff_warn_at") %||% 1e-14,
-    stop_at  = attr(reponse, "datadiff_stop_at") %||% 1e-14
+    stop_at  = attr(reponse, "datadiff_stop_at") %||% 1e-14,
+    real_agent = reponse
   )
   report <- pointblank::get_agent_report(agent)
   if (!is.null(file)) {

diff --git a/tests/testthat/test-coverage.R b/tests/testthat/test-coverage.R
@@ -17,11 +17,14 @@ test_that("tolerance columns appear once each with correct counts", {
     row_validation_info = list(check_count = FALSE), row_count_ok = TRUE,
     ref_suffix = "__reference", na_equal = TRUE
   )
-  expect_equal(cov$column, c("a", "b"))
-  expect_equal(cov$check, c("tolerance", "tolerance"))
-  expect_equal(cov$n, c(3L, 3L))
-  expect_equal(cov$n_failed, c(0L, 1L))
-  expect_equal(cov$status, c("PASS", "FAIL"))
+  tol <- cov[cov$check == "tolerance", ]
+  expect_equal(tol$column, c("a", "b"))
+  expect_equal(tol$n, c(3L, 3L))
+  expect_equal(tol$n_failed, c(0L, 1L))
+  expect_equal(tol$status, c("PASS", "FAIL"))
+  # each column also gets a (passing) col_exists check, distinct from the value check
+  expect_setequal(cov$column[cov$check == "col_exists"], c("a", "b"))
+  expect_true(all(cov$status[cov$check == "col_exists"] == "PASS"))
 })
 
 # --- build_coverage: equality, lazy (__eq) and local (recompute) ------------
@@ -34,10 +37,11 @@ test_that("equality columns use pre-computed __eq when present (lazy path)", {
     row_validation_info = list(check_count = FALSE), row_count_ok = TRUE,
     ref_suffix = "__reference", na_equal = TRUE
   )
-  expect_equal(cov$check, "equality")
-  expect_equal(cov$n, 2L)
-  expect_equal(cov$n_failed, 1L)
-  expect_equal(cov$status, "FAIL")
+  eq <- cov[cov$check == "equality", ]
+  expect_equal(eq$n, 2L)
+  expect_equal(eq$n_failed, 1L)
+  expect_equal(eq$status, "FAIL")
+  expect_equal(cov$column[cov$check == "col_exists"], "txt")
 })
 
 test_that("equality columns recompute from raw values when no __eq (local path)", {
@@ -52,9 +56,10 @@ test_that("equality columns recompute from raw values when no __eq (local path)"
     row_validation_info = list(check_count = FALSE), row_count_ok = TRUE,
     ref_suffix = "__reference", na_equal = TRUE
   )
-  expect_equal(cov$n, 3L)
-  expect_equal(cov$n_failed, 1L)
-  expect_equal(cov$status, "FAIL")
+  eq <- cov[cov$check == "equality", ]
+  expect_equal(eq$n, 3L)
+  expect_equal(eq$n_failed, 1L)
+  expect_equal(eq$status, "FAIL")
 })
 
 test_that("equality NA semantics follow na_equal (local path)", {
@@ -67,8 +72,10 @@ test_that("equality NA semantics follow na_equal (local path)", {
                           list(check_count = FALSE), TRUE, "__reference", TRUE)
   cov_f <- build_coverage(tbl, character(0), "txt", character(0), character(0),
                           list(check_count = FALSE), TRUE, "__reference", FALSE)
-  expect_equal(cov_t$n_failed, 0L) # NA both sides + na_equal TRUE -> pass
-  expect_equal(cov_f$n_failed, 1L) # na_equal FALSE -> the NA row fails
+  eq_t <- cov_t[cov_t$check == "equality", ]
+  eq_f <- cov_f[cov_f$check == "equality", ]
+  expect_equal(eq_t$n_failed, 0L) # NA both sides + na_equal TRUE -> pass
+  expect_equal(eq_f$n_failed, 1L) # na_equal FALSE -> the NA row fails
 })
 
 # --- build_coverage: structural checks --------------------------------------
@@ -115,7 +122,7 @@ test_that("row_count appears only when check_count is enabled, with right status
 
 # --- exhaustivity & degenerate -----------------------------------------------
 
-test_that("every column appears exactly once across check types", {
+test_that("every column is covered: common columns get col_exists + a value check", {
   tbl <- data.frame(a__ok = c(TRUE, TRUE), b__ok = c(TRUE, FALSE),
                     t__eq = c(TRUE, TRUE))
   cov <- build_coverage(
@@ -124,9 +131,13 @@ test_that("every column appears exactly once across check types", {
     row_validation_info = list(check_count = TRUE), row_count_ok = TRUE,
     ref_suffix = "__reference", na_equal = TRUE
   )
-  value_and_struct <- cov[cov$check != "row_count", "column"]
-  expect_setequal(value_and_struct, c("a", "b", "t", "m", "z"))
-  expect_equal(anyDuplicated(value_and_struct), 0L)
+  # common columns each appear as a col_exists AND a value (tolerance/equality) check
+  expect_setequal(cov$column[cov$check == "col_exists"], c("a", "b", "t"))
+  value <- cov[cov$check %in% c("tolerance", "equality"), "column"]
+  expect_setequal(value, c("a", "b", "t"))
+  # structural checks appear once each
+  expect_equal(cov$column[cov$check == "missing_column"], "m")
+  expect_equal(cov$column[cov$check == "type_mismatch"], "z")
 })
 
 test_that("empty inputs yield a 0-row coverage with the right columns", {
@@ -150,9 +161,10 @@ test_that("summarize_coverage aggregates checks and pass/fail counts", {
     row_validation_info = list(check_count = FALSE), row_count_ok = TRUE,
     ref_suffix = "__reference", na_equal = TRUE
   )
+  # col_exists(a) + col_exists(b) + tolerance(a FAIL) + tolerance(b PASS)
   s <- summarize_coverage(cov)
-  expect_equal(s$n_checks, 2L)
-  expect_equal(s$n_pass, 1L)
+  expect_equal(s$n_checks, 4L)
+  expect_equal(s$n_pass, 3L)
   expect_equal(s$n_fail, 1L)
   expect_false(s$all_passed)
 })
@@ -170,9 +182,10 @@ cov_from <- function(tbl, tol_cols, eq_cols, missing = character(0),
 test_that("print shows an all-PASS roll-up when everything passes", {
   cov <- cov_from(data.frame(a__ok = c(TRUE, TRUE), b__ok = c(TRUE, TRUE)),
                   c("a", "b"), character(0))
+  # 2 col_exists + 2 tolerance, all passing
   out <- paste(capture.output(print(cov)), collapse = "\n")
-  expect_match(out, "2 checks")
-  expect_match(out, "2 PASS")
+  expect_match(out, "4 checks")
+  expect_match(out, "4 PASS")
   expect_match(out, "0 FAIL")
 })
 
@@ -188,7 +201,8 @@ test_that("print lists failing checks first when there are failures", {
 test_that("print handles an all-fail table", {
   cov <- cov_from(data.frame(a__ok = c(FALSE, FALSE)), "a", character(0))
   out <- paste(capture.output(print(cov)), collapse = "\n")
-  expect_match(out, "1 checks - 0 PASS, 1 FAIL")
+  # col_exists(a) passes, tolerance(a) fails
+  expect_match(out, "2 checks - 1 PASS, 1 FAIL")
 })
 
 test_that("print handles an empty coverage table without error", {
@@ -233,5 +247,7 @@ test_that("coverage summary verdict matches all_passed on a failing comparison",
   )
   expect_false(res$all_passed)
   expect_equal(res$summary$all_passed, res$all_passed)
-  expect_equal(res$coverage[res$coverage$column == "a", "status"], "FAIL")
+  cov <- res$coverage
+  expect_equal(cov[cov$column == "a" & cov$check == "tolerance", "status"], "FAIL")
+  expect_equal(cov[cov$column == "a" & cov$check == "col_exists", "status"], "PASS")
 })