From 3ee3129054e6efbc210425c8c594006ebda1757a Mon Sep 17 00:00:00 2001
From: mvanhorn
Date: Wed, 17 Jun 2026 07:49:51 -0700
Subject: [PATCH] fix: Reject filesystem paths in HTTP_CALLS route extraction so client routes can join server routes

Signed-off-by: mvanhorn
---
 internal/cbm/service_patterns.c | 205 ++++++++++++++++++++++++++++++++
 src/pipeline/pass_route_nodes.c |  21 ++++
 tests/test_infrascan.c          | 124 +++++++++++++++++++--
 3 files changed, 341 insertions(+), 9 deletions(-)

diff --git a/internal/cbm/service_patterns.c b/internal/cbm/service_patterns.c
index 14ba30a6..053fe9b2 100644
--- a/internal/cbm/service_patterns.c
+++ b/internal/cbm/service_patterns.c
@@ -550,6 +550,211 @@ static const lib_pattern_t *match_qn(const char *qn, const lib_pattern_t *patter
     return NULL;
 }
 
+/* True when the first component of absolute `path` is exactly `segment`. */
+static bool starts_with_segment(const char *path, const char *segment) {
+    if (!path || path[0] != '/' || !segment) {
+        return false;
+    }
+    size_t seg_len = strlen(segment);
+    const char *p = path + 1;
+    return strncmp(p, segment, seg_len) == 0 && (p[seg_len] == '\0' || p[seg_len] == '/');
+}
+
+/* True when any component of `path` that follows a '/' is exactly `segment`. */
+static bool contains_segment(const char *path, const char *segment) {
+    if (!path || !segment) {
+        return false;
+    }
+    size_t seg_len = strlen(segment);
+    const char *p = path;
+    while ((p = strchr(p, '/')) != NULL) {
+        p++;
+        if (strncmp(p, segment, seg_len) == 0 && (p[seg_len] == '\0' || p[seg_len] == '/')) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* True for ASCII '0'..'9'. */
+static bool is_digit_char(char ch) {
+    return ch >= '0' && ch <= '9';
+}
+
+/* True when `path` begins with a typical HTTP prefix: api, apis, graphql, health, metrics,
+ * or a single-digit version component such as "/v1". */
+static bool has_http_route_marker(const char *path) {
+    if (starts_with_segment(path, "api") || starts_with_segment(path, "apis") ||
+        starts_with_segment(path, "graphql") || starts_with_segment(path, "health") ||
+        starts_with_segment(path, "metrics")) {
+        return true;
+    }
+    return path && path[0] == '/' && path[1] == 'v' && is_digit_char(path[2]) &&
+           (path[3] == '\0' || path[3] == '/');
+}
+
+/* True when the first component of `path` is a common Unix/macOS top-level directory. */
+static bool has_filesystem_root(const char *path) {
+    static const char *const roots[] = {"etc", "root", "var", "usr", "home", "tmp",
+                                        "private", "opt", "bin", "sbin", "dev", "proc",
+                                        "sys", "run", "lib", "lib64", "mnt", "media",
+                                        "boot", "srv", "Users", "Volumes", NULL};
+    for (int i = 0; roots[i]; i++) {
+        if (starts_with_segment(path, roots[i])) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* True when `path` contains a dot-directory component such as ".ssh" or ".git". */
+static bool has_hidden_config_segment(const char *path) {
+    static const char *const segments[] = {".aws", ".azure", ".config", ".docker", ".env",
+                                           ".git", ".gnupg", ".kube", ".ssh", NULL};
+    for (int i = 0; segments[i]; i++) {
+        if (contains_segment(path, segments[i])) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* NULL-safe exact comparison of two extension strings (leading dot included). */
+static bool path_ext_matches(const char *ext, const char *wanted) {
+    return ext && wanted && strcmp(ext, wanted) == 0;
+}
+
+/* True when the last component of `path` (query/fragment excluded) ends in a file-like
+ * extension. .json/.yaml/.yml/.xml count only when `path` lacks an HTTP route marker. */
+static bool has_filesystem_extension(const char *path) {
+    if (!path) {
+        return false;
+    }
+    const char *end = strpbrk(path, "?#");
+    if (!end) {
+        end = path + strlen(path);
+    }
+    const char *last_slash = path;
+    for (const char *p = path; p < end; p++) {
+        if (*p == '/') {
+            last_slash = p;
+        }
+    }
+    const char *dot = NULL;
+    for (const char *p = last_slash + 1; p < end; p++) {
+        if (*p == '.') {
+            dot = p;
+        }
+    }
+    if (!dot || dot == end - 1) {
+        return false;
+    }
+    char ext[32];
+    size_t ext_len = (size_t)(end - dot);
+    if (ext_len >= sizeof(ext)) {
+        return false;
+    }
+    memcpy(ext, dot, ext_len);
+    ext[ext_len] = '\0';
+
+    static const char *const hard_file_exts[] = {
+        ".cfg", ".conf", ".credentials", ".crt", ".db", ".env",
+        ".ini", ".key", ".pem", ".pid", ".properties", ".service",
+        ".sock", ".socket", ".sqlite", ".toml", NULL};
+    for (int i = 0; hard_file_exts[i]; i++) {
+        if (path_ext_matches(ext, hard_file_exts[i])) {
+            return true;
+        }
+    }
+    if ((path_ext_matches(ext, ".json") || path_ext_matches(ext, ".yaml") ||
+         path_ext_matches(ext, ".yml") || path_ext_matches(ext, ".xml")) &&
+        !has_http_route_marker(path)) {
+        return true;
+    }
+    return false;
+}
+
+/* True when `callee_name` is a string splitter/joiner (split, rsplit, partition, join) or a
+ * path.join-style builder; the caller treats a literal passed to such a callee as not a
+ * route. The method name is the text after the last '.' or "::", whichever occurs later. */
+static bool callee_is_delimiter_or_filesystem_builder(const char *callee_name) {
+    if (!callee_name) {
+        return false;
+    }
+    const char *method = callee_name;
+    const char *last_dot = strrchr(callee_name, '.');
+    if (last_dot && last_dot[1]) {
+        method = last_dot + 1;
+    }
+    /* strstr() alone yields the FIRST "::"; keep scanning so the last qualifier wins. */
+    for (const char *sep = strstr(callee_name, "::"); sep; sep = strstr(sep + 2, "::")) {
+        if (sep[2] && sep + 2 > method) {
+            method = sep + 2;
+        }
+    }
+    if (strcmp(method, "split") == 0 || strcmp(method, "rsplit") == 0 ||
+        strcmp(method, "partition") == 0 || strcmp(method, "join") == 0) {
+        return true;
+    }
+    return strstr(callee_name, "path.join") != NULL; /* also matches "os.path.join" */
+}
+
+/* Copies `literal` into `buf` with surrounding whitespace and one pair of matching quotes
+ * (", ' or `) removed. Returns `buf`, or NULL if the result is empty or does not fit. */
+static const char *strip_string_delimiters(const char *literal, char *buf, size_t buf_sz) {
+    if (!literal || !literal[0]) {
+        return NULL;
+    }
+    const char *start = literal;
+    while (*start == ' ' || *start == '\t' || *start == '\n' || *start == '\r') {
+        start++;
+    }
+    size_t len = strlen(start);
+    while (len > 0 && (start[len - 1] == ' ' || start[len - 1] == '\t' || start[len - 1] == '\n' ||
+                       start[len - 1] == '\r')) {
+        len--;
+    }
+    if (len >= 2 && (start[0] == '"' || start[0] == '\'' || start[0] == '`') &&
+        start[len - 1] == start[0]) {
+        start++;
+        len -= 2;
+    }
+    if (len == 0 || len >= buf_sz) {
+        return NULL;
+    }
+    memcpy(buf, start, len);
+    buf[len] = '\0';
+    return buf;
+}
+
+/* Decides whether a string literal passed to `callee_name` should be treated as an HTTP route.
+ * http:// and https:// URLs are accepted; other schemes and non-absolute paths are rejected,
+ * as are absolute paths that look like filesystem paths or feed a split/join-style callee. */
+bool cbm_service_pattern_is_http_route_literal(const char *literal, const char *callee_name) {
+    char path_buf[1024];
+    const char *path = strip_string_delimiters(literal, path_buf, sizeof(path_buf));
+    if (!path || !path[0]) {
+        return false;
+    }
+    if (strncmp(path, "http://", 7) == 0 || strncmp(path, "https://", 8) == 0) {
+        return true;
+    }
+    if (strstr(path, "://") != NULL) {
+        return false;
+    }
+    if (path[0] != '/') {
+        return false;
+    }
+    if (callee_is_delimiter_or_filesystem_builder(callee_name)) {
+        return false;
+    }
+    if (has_filesystem_root(path) || has_hidden_config_segment(path) ||
+        has_filesystem_extension(path)) {
+        return false;
+    }
+    return true;
+}
+
 /* ── Public API ────────────────────────────────────────────────── */
 
 /* Per-worker TLS cache of cbm_service_pattern_match results.
diff --git a/src/pipeline/pass_route_nodes.c b/src/pipeline/pass_route_nodes.c
index b543aaaa..c6d7d029 100644
--- a/src/pipeline/pass_route_nodes.c
+++ b/src/pipeline/pass_route_nodes.c
@@ -37,6 +37,8 @@ enum {
 #include
 #include
 
+bool cbm_service_pattern_is_http_route_literal(const char *literal, const char *callee_name);
+
 /* Extract a JSON string value by key from properties.
  * Returns pointer into buf (caller provides buffer). NULL if not found. */
 static const char *json_extract(const char *json, const char *key, char *buf, int bufsz) {
@@ -84,6 +86,13 @@ static void route_edge_visitor(const cbm_gbuf_edge_t *edge, void *userdata) {
     if (!url || !url[0]) {
         return;
     }
+    char callee_buf[CBM_SZ_256];
+    const char *callee =
+        json_extract(edge->properties_json, "callee", callee_buf, sizeof(callee_buf));
+    if (strcmp(edge->type, "HTTP_CALLS") == 0 &&
+        !cbm_service_pattern_is_http_route_literal(url, callee)) {
+        return;
+    }
 
     /* Extract method or broker */
     char method_buf[CBM_SZ_16];
@@ -573,6 +582,15 @@ typedef struct {
     const char *edge_type;
 } caller_edge_ref_t;
 
+static bool http_call_edge_has_valid_route(const cbm_gbuf_edge_t *edge) {
+    char url_buf[CBM_SZ_512];
+    const char *url = json_extract(edge->properties_json, "url_path", url_buf, sizeof(url_buf));
+    char callee_buf[CBM_SZ_256];
+    const char *callee =
+        json_extract(edge->properties_json, "callee", callee_buf, sizeof(callee_buf));
+    return cbm_service_pattern_is_http_route_literal(url, callee);
+}
+
 /* Try to create a DATA_FLOWS edge between caller and handler via a route.
  * Returns: 1=created, 0=skipped (self/duplicate), -1=skipped (has direct call). */
 static int try_create_data_flow(cbm_gbuf_t *gb, int64_t caller_id, int64_t handler_id,
@@ -655,6 +673,9 @@ static int collect_caller_edges(cbm_gbuf_t *gb, int64_t route_id, caller_edge_re
     int http_count = 0;
     cbm_gbuf_find_edges_by_target_type(gb, route_id, "HTTP_CALLS", &http_edges, &http_count);
     for (int i = 0; i < http_count && n < max_out; i++) {
+        if (!http_call_edge_has_valid_route(http_edges[i])) {
+            continue;
+        }
         out[n].source_id = http_edges[i]->source_id;
         out[n].props = http_edges[i]->properties_json;
         out[n].edge_type = "HTTP_CALLS";
diff --git a/tests/test_infrascan.c b/tests/test_infrascan.c
index 85421b90..a0e45d15 100644
--- a/tests/test_infrascan.c
+++ b/tests/test_infrascan.c
@@ -1,13 +1,119 @@
-/*
- * test_infrascan.c — REMOVED: infrascan tests already exist in test_pipeline.c.
- *
- * This file is intentionally empty. All infrascan tests are in test_pipeline.c
- * (infra_parse_dockerfile_*, infra_parse_dotenv*, infra_parse_shell*,
- * infra_parse_terraform*, infra_is_*, infra_clean_json_brackets,
- * infra_secret_detection, infra_qn_helper).
- */
 #include "test_framework.h"
+#include "graph_buffer/graph_buffer.h"
+#include "pipeline/pipeline_internal.h"
+
+#include
+#include
+
+bool cbm_service_pattern_is_http_route_literal(const char *literal, const char *callee_name);
+
+static int has_data_flow(cbm_gbuf_t *gb, int64_t source_id, int64_t target_id) {
+    const cbm_gbuf_edge_t **edges = NULL;
+    int count = 0;
+    cbm_gbuf_find_edges_by_source_type(gb, source_id, "DATA_FLOWS", &edges, &count);
+    for (int i = 0; i < count; i++) {
+        if (edges[i]->target_id == target_id) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+TEST(infrascan_http_route_literal_guard_rejects_filesystem_paths) {
+    ASSERT_FALSE(cbm_service_pattern_is_http_route_literal("/etc/crio/crio.conf", "requests.get"));
+    ASSERT_FALSE(
+        cbm_service_pattern_is_http_route_literal("/root/.aws/credentials", "requests.get"));
+    ASSERT_FALSE(cbm_service_pattern_is_http_route_literal("/var/run/app.json", "requests.get"));
+    ASSERT_FALSE(cbm_service_pattern_is_http_route_literal("/locations/", "str.split"));
+    ASSERT_FALSE(cbm_service_pattern_is_http_route_literal("/api", "os.path.join"));
+    ASSERT_FALSE(cbm_service_pattern_is_http_route_literal(NULL, "requests.get"));
+    ASSERT_FALSE(cbm_service_pattern_is_http_route_literal("", "requests.get"));
+    ASSERT_TRUE(cbm_service_pattern_is_http_route_literal("/api/orders", "requests.get"));
+    ASSERT_TRUE(cbm_service_pattern_is_http_route_literal("https://orders.example/api/orders",
+                                                          "requests.get"));
+    PASS();
+}
+
+TEST(infrascan_route_nodes_skip_bad_http_url_paths) {
+    cbm_gbuf_t *gb = cbm_gbuf_new("test", "/tmp/cbm_infrascan_route_guard");
+    ASSERT_NOT_NULL(gb);
+    int64_t caller =
+        cbm_gbuf_upsert_node(gb, "Function", "client", "test.client", "client.py", 1, 3, "{}");
+    int64_t fs_callee =
+        cbm_gbuf_upsert_node(gb, "Function", "requests.get", "requests.get", "", 0, 0, "{}");
+    int64_t split_callee =
+        cbm_gbuf_upsert_node(gb, "Function", "str.split", "str.split", "", 0, 0, "{}");
+    int64_t empty_callee =
+        cbm_gbuf_upsert_node(gb, "Function", "requests.post", "requests.post", "", 0, 0, "{}");
+    ASSERT_GT(caller, 0);
+    ASSERT_GT(fs_callee, 0);
+    ASSERT_GT(split_callee, 0);
+    ASSERT_GT(empty_callee, 0);
+
+    cbm_gbuf_insert_edge(gb, caller, fs_callee, "HTTP_CALLS",
+                         "{\"callee\":\"requests.get\",\"url_path\":\"/etc/crio/crio.conf\","
+                         "\"method\":\"GET\"}");
+    cbm_gbuf_insert_edge(gb, caller, split_callee, "HTTP_CALLS",
+                         "{\"callee\":\"str.split\",\"url_path\":\"/locations/\","
+                         "\"method\":\"ANY\"}");
+    cbm_gbuf_insert_edge(gb, caller, empty_callee, "HTTP_CALLS",
+                         "{\"callee\":\"requests.get\",\"method\":\"GET\"}");
+
+    cbm_pipeline_create_route_nodes(gb);
+
+    ASSERT_NULL(cbm_gbuf_find_by_qn(gb, "__route__GET__/etc/crio/crio.conf"));
+    ASSERT_NULL(cbm_gbuf_find_by_qn(gb, "__route__ANY__/locations/"));
+    ASSERT_NULL(cbm_gbuf_find_by_qn(gb, "__route__GET__"));
+
+    cbm_gbuf_free(gb);
+    PASS();
+}
+
+TEST(infrascan_http_calls_join_matching_handler_route) {
+    cbm_gbuf_t *gb = cbm_gbuf_new("test", "/tmp/cbm_infrascan_route_join");
+    ASSERT_NOT_NULL(gb);
+
+    int64_t route = cbm_gbuf_upsert_node(gb, "Route", "/api/orders", "__route__GET__/api/orders",
+                                         "", 0, 0, "{\"method\":\"GET\"}");
+    int64_t handler = cbm_gbuf_upsert_node(gb, "Function", "get_orders", "test.get_orders",
+                                           "server.py", 1, 3, "{}");
+    int64_t client =
+        cbm_gbuf_upsert_node(gb, "Function", "client", "test.client", "client.py", 1, 3, "{}");
+    int64_t bad_route =
+        cbm_gbuf_upsert_node(gb, "Route", "/etc/crio/crio.conf",
+                             "__route__GET__/etc/crio/crio.conf", "", 0, 0, "{\"method\":\"GET\"}");
+    int64_t bad_handler = cbm_gbuf_upsert_node(gb, "Function", "bad_handler", "test.bad_handler",
+                                               "server.py", 5, 7, "{}");
+    int64_t bad_client = cbm_gbuf_upsert_node(gb, "Function", "bad_client", "test.bad_client",
+                                              "client.py", 5, 7, "{}");
+    ASSERT_GT(route, 0);
+    ASSERT_GT(handler, 0);
+    ASSERT_GT(client, 0);
+    ASSERT_GT(bad_route, 0);
+    ASSERT_GT(bad_handler, 0);
+    ASSERT_GT(bad_client, 0);
+
+    cbm_gbuf_insert_edge(gb, handler, route, "HANDLES", "{\"handler\":\"test.get_orders\"}");
+    cbm_gbuf_insert_edge(gb, client, route, "HTTP_CALLS",
+                         "{\"callee\":\"requests.get\",\"url_path\":\"/api/orders\","
+                         "\"method\":\"GET\"}");
+    cbm_gbuf_insert_edge(gb, bad_handler, bad_route, "HANDLES",
+                         "{\"handler\":\"test.bad_handler\"}");
+    cbm_gbuf_insert_edge(gb, bad_client, bad_route, "HTTP_CALLS",
+                         "{\"callee\":\"requests.get\",\"url_path\":\"/etc/crio/crio.conf\","
+                         "\"method\":\"GET\"}");
+
+    cbm_pipeline_create_route_nodes(gb);
+
+    ASSERT_TRUE(has_data_flow(gb, client, handler));
+    ASSERT_FALSE(has_data_flow(gb, bad_client, bad_handler));
+
+    cbm_gbuf_free(gb);
+    PASS();
+}
 
 SUITE(infrascan) {
-    /* All infrascan tests live in test_pipeline.c's pipeline suite */
+    RUN_TEST(infrascan_http_route_literal_guard_rejects_filesystem_paths);
+    RUN_TEST(infrascan_route_nodes_skip_bad_http_url_paths);
+    RUN_TEST(infrascan_http_calls_join_matching_handler_route);
 }