From 350617cb0756a49e7858fe722c73a5a5e6028675 Mon Sep 17 00:00:00 2001
From: agrim-git-hub
Date: Wed, 31 Dec 2025 16:44:44 +0530
Subject: [PATCH 1/5] feature added: --validate-checksum flag

---
 databusclient/api/download.py | 144 ++++++++++++++++++++++++++++++++++
 databusclient/cli.py          |  12 ++-
 2 files changed, 154 insertions(+), 2 deletions(-)

diff --git a/databusclient/api/download.py b/databusclient/api/download.py
index ac55faa..7fe69a9 100644
--- a/databusclient/api/download.py
+++ b/databusclient/api/download.py
@@ -13,6 +13,53 @@
 )
 
+
+def _extract_checksum_from_node(node) -> str | None:
+    """
+    Try to extract a 64-char hex checksum from a JSON-LD file node.
+    Handles these common shapes:
+    - checksum or sha256sum fields as plain string
+    - checksum fields as dict with '@value'
+    - nested values (recursively search strings for a 64-char hex)
+    """
+    def find_in_value(v):
+        if isinstance(v, str):
+            s = v.strip()
+            if len(s) == 64 and all(c in "0123456789abcdefABCDEF" for c in s):
+                return s
+        if isinstance(v, dict):
+            # common JSON-LD value object
+            if "@value" in v and isinstance(v["@value"], str):
+                res = find_in_value(v["@value"])
+                if res:
+                    return res
+            # try all nested dict values
+            for vv in v.values():
+                res = find_in_value(vv)
+                if res:
+                    return res
+        if isinstance(v, list):
+            for item in v:
+                res = find_in_value(item)
+                if res:
+                    return res
+        return None
+
+    # direct keys to try first
+    for key in ("checksum", "sha256sum", "sha256", "databus:checksum"):
+        if key in node:
+            res = find_in_value(node[key])
+            if res:
+                return res
+
+    # fallback: search all values recursively for a 64-char hex string
+    for v in node.values():
+        res = find_in_value(v)
+        if res:
+            return res
+    return None
+
+
 
 # Hosts that require Vault token based authentication. Central source of truth.
 VAULT_REQUIRED_HOSTS = {
     "data.dbpedia.io",
@@ -32,6 +79,8 @@ def _download_file(
     databus_key=None,
     auth_url=None,
     client_id=None,
+    validate_checksum: bool = False,
+    expected_checksum: str | None = None,
 ) -> None:
     """
     Download a file from the internet with a progress bar using tqdm.
@@ -183,6 +232,26 @@ def _download_file(
     if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
         raise IOError("Downloaded size does not match Content-Length header")
 
+    # --- 6. Optional checksum validation ---
+    if validate_checksum:
+        # reuse compute_sha256_and_length from webdav extension
+        try:
+            from databusclient.extensions.webdav import compute_sha256_and_length
+
+            actual, _ = compute_sha256_and_length(filename)
+        except Exception:
+            actual = None
+
+        if expected_checksum is None:
+            print(f"WARNING: no expected checksum available for {filename}; skipping validation")
+        elif actual is None:
+            print(f"WARNING: could not compute checksum for {filename}; skipping validation")
+        else:
+            if actual.lower() != expected_checksum.lower():
+                raise IOError(
+                    f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}"
+                )
+
 
 def _download_files(
     urls: List[str],
@@ -191,6 +260,8 @@ def _download_files(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    validate_checksum: bool = False,
+    checksums: dict | None = None,
 ) -> None:
     """
     Download multiple files from the databus.
@@ -204,6 +275,9 @@ def _download_files(
     - client_id: Client ID for token exchange
     """
     for url in urls:
+        expected = None
+        if checksums and isinstance(checksums, dict):
+            expected = checksums.get(url)
         _download_file(
             url=url,
             localDir=localDir,
@@ -211,6 +285,8 @@ def _download_files(
             databus_key=databus_key,
             auth_url=auth_url,
             client_id=client_id,
+            validate_checksum=validate_checksum,
+            expected_checksum=expected,
         )
 
 
@@ -358,6 +434,7 @@ def _download_collection(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    validate_checksum: bool = False
 ) -> None:
     """
     Download all files in a databus collection.
@@ -382,6 +459,7 @@ def _download_collection(
         databus_key=databus_key,
         auth_url=auth_url,
         client_id=client_id,
+        validate_checksum=validate_checksum,
     )
 
 
@@ -392,6 +470,7 @@ def _download_version(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    validate_checksum: bool = False,
 ) -> None:
     """
     Download all files in a databus artifact version.
@@ -406,6 +485,22 @@ def _download_version(
     """
     json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
     file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
+    # build url -> checksum mapping from JSON-LD when available
+    checksums: dict = {}
+    try:
+        json_dict = json.loads(json_str)
+        graph = json_dict.get("@graph", [])
+        for node in graph:
+            if node.get("@type") == "Part":
+                file_uri = node.get("file")
+                if not isinstance(file_uri, str):
+                    continue
+                expected = _extract_checksum_from_node(node)
+                if expected:
+                    checksums[file_uri] = expected
+    except Exception:
+        checksums = {}
+
     _download_files(
         file_urls,
         localDir,
@@ -413,6 +508,8 @@ def _download_version(
         databus_key=databus_key,
         auth_url=auth_url,
         client_id=client_id,
+        validate_checksum=validate_checksum,
+        checksums=checksums,
     )
 
 
@@ -424,6 +521,7 @@ def _download_artifact(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    validate_checksum: bool = False,
 ) -> None:
     """
     Download files in a databus artifact.
@@ -445,6 +543,22 @@ def _download_artifact(
         print(f"Downloading version: {version_uri}")
         json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
         file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
+        # extract checksums for this version
+        checksums: dict = {}
+        try:
+            jd = json.loads(json_str)
+            graph = jd.get("@graph", [])
+            for node in graph:
+                if node.get("@type") == "Part":
+                    file_uri = node.get("file")
+                    if not isinstance(file_uri, str):
+                        continue
+                    expected = _extract_checksum_from_node(node)
+                    if expected:
+                        checksums[file_uri] = expected
+        except Exception:
+            checksums = {}
+
         _download_files(
             file_urls,
             localDir,
@@ -452,6 +566,8 @@ def _download_artifact(
             databus_key=databus_key,
             auth_url=auth_url,
             client_id=client_id,
+            validate_checksum=validate_checksum,
+            checksums=checksums,
        )
 
 
@@ -527,6 +643,7 @@ def _download_group(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    validate_checksum: bool = False,
 ) -> None:
     """
     Download files in a databus group.
@@ -552,6 +669,7 @@ def _download_group(
             databus_key=databus_key,
             auth_url=auth_url,
             client_id=client_id,
+            validate_checksum=validate_checksum,
         )
 
 
@@ -598,6 +716,7 @@ def download(
     all_versions=None,
     auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token",
     client_id="vault-token-exchange",
+    validate_checksum: bool = False
 ) -> None:
     """
     Download datasets from databus.
@@ -638,9 +757,27 @@ def download(
                 databus_key,
                 auth_url,
                 client_id,
+                validate_checksum=validate_checksum,
             )
         elif file is not None:
             print(f"Downloading file: {databusURI}")
+            # Try to fetch expected checksum from the parent Version metadata
+            expected = None
+            if validate_checksum:
+                try:
+                    version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}"
+                    json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
+                    json_dict = json.loads(json_str)
+                    graph = json_dict.get("@graph", [])
+                    for node in graph:
+                        if node.get("file") == databusURI or node.get("@id") == databusURI:
+                            expected = _extract_checksum_from_node(node)
+                            if expected:
+                                break
+                except Exception as e:
+                    print(f"WARNING: Could not fetch checksum for single file: {e}")
+
+            # Call the worker to download the single file (passes expected checksum)
             _download_file(
                 databusURI,
                 localDir,
@@ -648,6 +785,8 @@ def download(
                 databus_key=databus_key,
                 auth_url=auth_url,
                 client_id=client_id,
+                validate_checksum=validate_checksum,
+                expected_checksum=expected,
             )
         elif version is not None:
             print(f"Downloading version: {databusURI}")
@@ -658,6 +797,8 @@ def download(
                 databus_key=databus_key,
                 auth_url=auth_url,
                 client_id=client_id,
+                validate_checksum=validate_checksum,
+                expected_checksum=expected,
             )
         elif artifact is not None:
             print(
@@ -671,6 +812,7 @@ def download(
                 databus_key=databus_key,
                 auth_url=auth_url,
                 client_id=client_id,
+                validate_checksum=validate_checksum,
             )
         elif group is not None and group != "collections":
             print(
@@ -684,6 +826,7 @@ def download(
                 databus_key=databus_key,
                 auth_url=auth_url,
                 client_id=client_id,
+                validate_checksum=validate_checksum,
             )
         elif account is not None:
             print("accountId not supported yet")  # TODO
@@ -709,4 +852,5 @@ def download(
                 databus_key=databus_key,
                 auth_url=auth_url,
                 client_id=client_id,
+                validate_checksum=validate_checksum,
             )
diff --git a/databusclient/cli.py b/databusclient/cli.py
index 069408e..8c70c4e 100644
--- a/databusclient/cli.py
+++ b/databusclient/cli.py
@@ -158,6 +158,11 @@ def deploy(
     show_default=True,
     help="Client ID for token exchange",
 )
+@click.option(
+    "--validate-checksum",
+    is_flag=True,
+    help="Validate checksums of downloaded files"
+)
 def download(
     databusuris: List[str],
     localdir,
@@ -167,7 +172,9 @@ def download(
     all_versions,
     authurl,
     clientid,
+    validate_checksum,
 ):
+
     """
     Download datasets from databus, optionally using vault access if vault options are provided.
""" @@ -181,7 +188,8 @@ def download( all_versions=all_versions, auth_url=authurl, client_id=clientid, - ) + validate_checksum=validate_checksum + ) except DownloadAuthError as e: raise click.ClickException(str(e)) @@ -214,4 +222,4 @@ def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool) if __name__ == "__main__": - app() + download() From 743c6237a761a4d0625c404bf8955d49e76692e7 Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Wed, 31 Dec 2025 17:13:04 +0530 Subject: [PATCH 2/5] refactor: address CodeRabbit review comments --- databusclient/api/download.py | 4 +--- databusclient/cli.py | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 7fe69a9..76ff3c0 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -12,6 +12,7 @@ get_databus_id_parts_from_file_url, ) +from databusclient.extensions.webdav import compute_sha256_and_length def _extract_checksum_from_node(node) -> str | None: """ @@ -236,8 +237,6 @@ def _download_file( if validate_checksum: # reuse compute_sha256_and_length from webdav extension try: - from databusclient.extensions.webdav import compute_sha256_and_length - actual, _ = compute_sha256_and_length(filename) except Exception: actual = None @@ -798,7 +797,6 @@ def download( auth_url=auth_url, client_id=client_id, validate_checksum=validate_checksum, - expected_checksum=expected, ) elif artifact is not None: print( diff --git a/databusclient/cli.py b/databusclient/cli.py index 8c70c4e..420530d 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -173,8 +173,7 @@ def download( authurl, clientid, validate_checksum, -): - +): """ Download datasets from databus, optionally using vault access if vault options are provided. """ @@ -222,4 +221,4 @@ def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool) if __name__ == "__main__": - download() + app() From e33ab8cc8f1b5f926ad258cbb860a89701c91341 Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Wed, 31 Dec 2025 17:29:58 +0530 Subject: [PATCH 3/5] refactor: address CodeRabbit review comments (2) --- databusclient/api/download.py | 40 ++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 76ff3c0..e01f3b7 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -238,7 +238,8 @@ def _download_file( # reuse compute_sha256_and_length from webdav extension try: actual, _ = compute_sha256_and_length(filename) - except Exception: + except (OSError, IOError) as e: + print(f"WARNING: error computing checksum for {filename}: {e}") actual = None if expected_checksum is None: @@ -451,6 +452,42 @@ def _download_collection( file_urls = _get_file_download_urls_from_sparql_query( endpoint, query, databus_key=databus_key ) + + # If checksum validation requested, attempt to build url->checksum mapping + # by fetching the Version JSON-LD for each file's version. We group files + # by their version URI to avoid fetching the same metadata repeatedly. 
+    checksums: dict = {}
+    if validate_checksum:
+        # Map version_uri -> list of file urls
+        versions_map: dict = {}
+        for fu in file_urls:
+            try:
+                h, acc, grp, art, ver, f = get_databus_id_parts_from_file_url(fu)
+            except Exception:
+                continue
+            if ver is None:
+                continue
+            version_uri = f"https://{h}/{acc}/{grp}/{art}/{ver}"
+            versions_map.setdefault(version_uri, []).append(fu)
+
+        # Fetch each version's JSON-LD once and extract checksums for its files
+        for version_uri, urls_in_version in versions_map.items():
+            try:
+                json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
+                jd = json.loads(json_str)
+                graph = jd.get("@graph", [])
+                for node in graph:
+                    if node.get("@type") == "Part":
+                        file_uri = node.get("file")
+                        if not isinstance(file_uri, str):
+                            continue
+                        expected = _extract_checksum_from_node(node)
+                        if expected and file_uri in urls_in_version:
+                            checksums[file_uri] = expected
+            except Exception:
+                # Best-effort: if fetching a version fails, skip it
+                continue
+
     _download_files(
         list(file_urls),
         localDir,
@@ -459,6 +496,7 @@ def _download_collection(
         databus_key=databus_key,
         auth_url=auth_url,
         client_id=client_id,
         validate_checksum=validate_checksum,
+        checksums=checksums if checksums else None,
     )
 

From dc51aa9306dcd649bc555530659be626c014c94c Mon Sep 17 00:00:00 2001
From: agrim-git-hub
Date: Wed, 31 Dec 2025 17:36:26 +0530
Subject: [PATCH 4/5] refactor: address CodeRabbit review comments (3)

---
 databusclient/api/download.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/databusclient/api/download.py b/databusclient/api/download.py
index e01f3b7..9881533 100644
--- a/databusclient/api/download.py
+++ b/databusclient/api/download.py
@@ -467,6 +467,8 @@ def _download_collection(
                 continue
             if ver is None:
                 continue
+            if h is None or acc is None or grp is None or art is None:
+                continue
             version_uri = f"https://{h}/{acc}/{grp}/{art}/{ver}"
             versions_map.setdefault(version_uri, []).append(fu)

From 5875a8210395e5376248d8f8a6e21de3f863b8d3 Mon Sep 17 00:00:00 2001
From: agrim-git-hub
Date: Wed, 31 Dec 2025 17:54:05 +0530
Subject: [PATCH 5/5] refactor: address CodeRabbit review comments (4)

---
 databusclient/api/download.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/databusclient/api/download.py b/databusclient/api/download.py
index 9881533..4af27c4 100644
--- a/databusclient/api/download.py
+++ b/databusclient/api/download.py
@@ -248,6 +248,8 @@ def _download_file(
             print(f"WARNING: could not compute checksum for {filename}; skipping validation")
         else:
             if actual.lower() != expected_checksum.lower():
+                try: os.remove(filename)  # delete corrupted file
+                except OSError: pass
                 raise IOError(
                     f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}"
                 )
@@ -878,6 +880,8 @@ def download(
         # query as argument
         else:
             print("QUERY {}", databusURI.replace("\n", " "))
+            if validate_checksum:
+                print("WARNING: Checksum validation is not supported for user-defined SPARQL queries.")
             if uri_endpoint is None:  # endpoint is required for queries (--databus)
                 raise ValueError("No endpoint given for query")
             res = _get_file_download_urls_from_sparql_query(
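Reviewer note: a quick way to sanity-check the new helper locally is to feed
_extract_checksum_from_node the JSON-LD value shapes its docstring lists. The
node contents below are hypothetical stand-ins; only the helper itself comes
from PATCH 1.

    # Hypothetical nodes; the helper is imported exactly as added in PATCH 1.
    from databusclient.api.download import _extract_checksum_from_node

    digest = "a" * 64  # stand-in for a real SHA-256 hex digest

    # plain string field
    assert _extract_checksum_from_node({"sha256sum": digest}) == digest

    # JSON-LD value object with '@value'
    assert _extract_checksum_from_node({"checksum": {"@value": digest}}) == digest

    # strings that are not 64-char hex are ignored, so unrelated fields are safe
    assert _extract_checksum_from_node({"file": "https://example.org/file.ttl"}) is None

End-to-end, the same path is exercised through the CLI option added in cli.py,
e.g. `databusclient download --validate-checksum <DATABUS_URI>` (assuming the
installed console script is named `databusclient`).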