Skip to content

⚡️ Speed up function find_leaf_nodes by 38,690% #302

Open
codeflash-ai[bot] wants to merge 1 commit into main from
codeflash/optimize-find_leaf_nodes-mmuc08dp
Open

⚡️ Speed up function find_leaf_nodes by 38,690% #302
codeflash-ai[bot] wants to merge 1 commit into main from
codeflash/optimize-find_leaf_nodes-mmuc08dp

Conversation

@codeflash-ai
Copy link

@codeflash-ai codeflash-ai bot commented Mar 17, 2026

📄 38,690% (386.90x) speedup for find_leaf_nodes in src/algorithms/graph.py

⏱️ Runtime: 469 milliseconds → 1.21 milliseconds (best of 180 runs)

📝 Explanation and details

The original nested loop scanned all edges for every node (O(n·m)), spending 99.9% of runtime on dictionary lookups inside that double loop. The optimization builds a set of source IDs once (O(m)), then checks membership in O(1) per node (O(n) total), reducing the 1,000-node chain test from 17.6 ms to 57.5 µs. A try-except wrapper falls back to list membership if any source value is unhashable, preserving correctness while still delivering massive speedups (roughly 12× to 520×, i.e. about 1,000% to 52,000%, across the large-graph tests that contain edges). Micro-benchmarks on empty inputs regress by ~30% due to set-construction overhead, but all realistic workloads show dramatic gains.

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 39 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 2 Passed
📊 Tests Coverage 100.0%
🌀 Click to see Generated Regression Tests
import pytest  # used for our unit tests

# import the function under test from the provided module path
from src.algorithms.graph import find_leaf_nodes


def test_single_leaf_simple_graph():
    """A three-node chain A -> B -> C must yield C as the sole leaf."""
    node_a, node_b, node_c = (
        {"id": "A", "payload": 1},
        {"id": "B", "payload": 2},
        {"id": "C", "payload": 3},
    )
    chain = [
        {"source": "A", "target": "B"},
        {"source": "B", "target": "C"},
    ]

    result = find_leaf_nodes([node_a, node_b, node_c], chain)  # 1.23μs -> 1.06μs (16.0% faster)

    # C has no outgoing edge, so it is the only leaf ...
    assert result == [node_c]
    # ... and it must be the caller's own dict object, not a copy.
    assert result[0] is node_c


def test_empty_nodes_returns_empty_list_and_no_edges_all_leaves():
    """Cover two degenerate inputs: no nodes at all, and nodes without edges."""
    # No nodes in, no leaves out.
    empty_result = find_leaf_nodes([], [])  # 360ns -> 552ns (34.8% slower)
    assert empty_result == []

    # Without edges nothing has an outgoing edge, so every node comes back,
    # in input order.
    isolated = [{"id": node_id} for node_id in (1, 2, 3)]
    assert find_leaf_nodes(isolated, []) == isolated  # 691ns -> 651ns (6.14% faster)


def test_duplicate_node_ids_treated_by_id_equality():
    """Nodes sharing an id are all disqualified by a single edge from that id."""
    first = {"id": 1, "name": "first"}
    second = {"id": 1, "name": "second"}
    other = {"id": 2, "name": "other"}

    # The only edge leaves id 1.  Matching is expected to be by id value, so
    # both dicts carrying id 1 stop being leaves.
    result = find_leaf_nodes(
        [first, second, other],
        [{"source": 1, "target": 2}],
    )  # 1.16μs -> 942ns (23.4% faster)

    assert result == [other]


def test_various_id_types_and_values():
    """None, str, tuple and int ids are all compared to edge sources by equality."""
    pair = (1, 2)
    none_node = {"id": None}
    str_node = {"id": "x"}
    tuple_node = {"id": pair}
    zero_node = {"id": 0}

    # The string id and the tuple id each appear as an edge source, so those
    # two nodes are not leaves.
    outgoing = [
        {"source": "x", "target": None},
        {"source": pair, "target": 0},
    ]
    leaves = find_leaf_nodes(
        [none_node, str_node, tuple_node, zero_node], outgoing
    )  # 2.00μs -> 1.35μs (48.0% faster)

    # The None and 0 ids never appear as a source.
    assert leaves == [none_node, zero_node]


def test_missing_source_key_in_edge_raises_keyerror():
    """An edge dict lacking the "source" key is expected to surface as KeyError."""
    malformed_edges = [{"src": 1}]  # key is misspelled on purpose
    single_node = [{"id": 1}]

    with pytest.raises(KeyError):
        find_leaf_nodes(single_node, malformed_edges)  # 1.06μs -> 1.67μs (36.5% slower)


def test_order_of_nodes_preserved_in_result():
    """Leaves are returned in the same relative order as the input nodes."""
    first, middle, last = {"id": "n1"}, {"id": "n2"}, {"id": "n3"}
    # n2 is the only node with an outgoing edge.
    edges = [{"source": "n2", "target": "n1"}]

    result = find_leaf_nodes([first, middle, last], edges)  # 1.26μs -> 1.05μs (20.1% faster)

    # n1 must still precede n3.
    assert result == [first, last]


def test_large_chain_graph_1000_nodes_only_last_is_leaf():
    """Chain 0 -> 1 -> ... -> 999: only the final node is a leaf."""
    count = 1000
    nodes = [{"id": node_id} for node_id in range(count)]
    # One edge per consecutive pair: sources are 0..count-2.
    edges = [{"source": src, "target": src + 1} for src in range(count - 1)]

    leaves = find_leaf_nodes(nodes, edges)  # 17.6ms -> 57.7μs (30456% faster)

    assert len(leaves) == 1
    tail = leaves[0]
    assert tail == {"id": count - 1}
    # Must be the very dict object from the input list.
    assert tail is nodes[-1]


def test_large_no_edges_all_nodes_are_leaves_1000_nodes():
    """Stress case: 1000 nodes and no edges, so every node is a leaf."""
    count = 1000
    nodes = [{"id": f"node-{i}"} for i in range(count)]
    no_edges = []

    leaves = find_leaf_nodes(nodes, no_edges)  # 47.3μs -> 43.5μs (8.68% faster)

    assert len(leaves) == count
    # Same contents, same order, and the same dict objects as the input.
    assert leaves is not None
    assert leaves == nodes
    assert all(got is expected for got, expected in zip(leaves, nodes))
import pytest
from src.algorithms.graph import find_leaf_nodes


def test_single_leaf_node():
    """A lone node with no edges is itself a leaf."""
    only_node = {"id": 1, "name": "A"}
    result = find_leaf_nodes([only_node], [])  # 571ns -> 751ns (24.0% slower)
    assert len(result) == 1
    assert result == [{"id": 1, "name": "A"}]


def test_all_nodes_are_leaves():
    """With no edges at all, every input node is reported as a leaf."""
    nodes = [
        {"id": 1, "name": "A"},
        {"id": 2, "name": "B"},
        {"id": 3, "name": "C"},
    ]
    result = find_leaf_nodes(nodes, [])  # 732ns -> 861ns (15.0% slower)
    assert len(result) == 3
    for node in nodes:
        assert node in result


def test_no_leaf_nodes():
    """A 3-cycle gives every node an outgoing edge, so nothing is a leaf."""
    nodes = [
        {"id": 1, "name": "A"},
        {"id": 2, "name": "B"},
        {"id": 3, "name": "C"},
    ]
    # 1 -> 2 -> 3 -> 1
    cycle = [
        {"source": src, "target": dst}
        for src, dst in ((1, 2), (2, 3), (3, 1))
    ]
    result = find_leaf_nodes(nodes, cycle)  # 1.15μs -> 982ns (17.4% faster)
    assert result == []
    assert len(result) == 0


def test_some_leaf_nodes():
    """One edge out of node 1 leaves nodes 2 and 3 as the leaves."""
    node_a = {"id": 1, "name": "A"}
    node_b = {"id": 2, "name": "B"}
    node_c = {"id": 3, "name": "C"}

    result = find_leaf_nodes(
        [node_a, node_b, node_c],
        [{"source": 1, "target": 2}],
    )  # 1.04μs -> 911ns (14.4% faster)

    assert len(result) == 2
    assert node_b in result
    assert node_c in result
    assert node_a not in result


def test_single_edge_chain():
    """In the chain 1 -> 2 -> 3 only the terminal node 3 is a leaf."""
    chain_nodes = [
        {"id": 1, "name": "A"},
        {"id": 2, "name": "B"},
        {"id": 3, "name": "C"},
    ]
    chain_edges = [{"source": src, "target": src + 1} for src in (1, 2)]
    result = find_leaf_nodes(chain_nodes, chain_edges)  # 1.06μs -> 922ns (15.2% faster)
    assert result == [{"id": 3, "name": "C"}]


def test_multiple_edges_from_same_source():
    """Several edges out of node 1 disqualify node 1 only."""
    hub = {"id": 1, "name": "A"}
    left = {"id": 2, "name": "B"}
    right = {"id": 3, "name": "C"}
    # Both edges share the same source.
    fan_out = [{"source": 1, "target": target} for target in (2, 3)]

    result = find_leaf_nodes([hub, left, right], fan_out)  # 1.07μs -> 952ns (12.6% faster)

    assert len(result) == 2
    assert left in result
    assert right in result


def test_preserves_node_order():
    """Test that leaf nodes are returned in the same order as input.

    Nodes are supplied with ids 3, 1, 2 and the only edge leaves node 1, so
    the expected leaves are exactly ids 3 then 2.
    """
    nodes = [{"id": 3, "name": "C"}, {"id": 1, "name": "A"}, {"id": 2, "name": "B"}]
    edges = [{"source": 1, "target": 2}]
    result = find_leaf_nodes(nodes, edges)  # 992ns -> 901ns (10.1% faster)
    # Compare the whole id sequence: checking only result[0] and result[1]
    # would let unexpected trailing entries slip through unnoticed.
    assert [node["id"] for node in result] == [3, 2]


def test_diamond_graph():
    """Diamond 1 -> {2, 3} -> 4: only the bottom node 4 is a leaf."""
    nodes = [
        {"id": node_id, "name": label}
        for node_id, label in ((1, "A"), (2, "B"), (3, "C"), (4, "D"))
    ]
    edges = [
        {"source": src, "target": dst}
        for src, dst in ((1, 2), (1, 3), (2, 4), (3, 4))
    ]
    result = find_leaf_nodes(nodes, edges)  # 1.41μs -> 1.10μs (28.1% faster)
    assert result == [{"id": 4, "name": "D"}]


def test_empty_nodes_list():
    """No nodes and no edges produce no leaves."""
    result = find_leaf_nodes([], [])  # 371ns -> 551ns (32.7% slower)
    assert result == []


def test_empty_nodes_with_edges():
    """Edges without any nodes to classify still give an empty result."""
    dangling_edges = [{"source": 1, "target": 2}]
    result = find_leaf_nodes([], dangling_edges)  # 351ns -> 691ns (49.2% slower)
    assert result == []


def test_nodes_with_string_ids():
    """String ids are handled just like integer ids."""
    node_a = {"id": "node_a", "name": "A"}
    node_b = {"id": "node_b", "name": "B"}
    node_c = {"id": "node_c", "name": "C"}

    result = find_leaf_nodes(
        [node_a, node_b, node_c],
        [{"source": "node_a", "target": "node_b"}],
    )  # 1.36μs -> 1.03μs (32.0% faster)

    assert len(result) == 2
    assert node_b in result
    assert node_c in result


def test_nodes_with_extra_attributes():
    """Node dicts carrying extra keys come back with those keys intact."""
    source_node = {"id": 1, "name": "A", "value": 10, "type": "type1"}
    leaf_b = {"id": 2, "name": "B", "value": 20, "type": "type2"}
    leaf_c = {"id": 3, "name": "C", "value": 30, "type": "type1"}

    result = find_leaf_nodes(
        [source_node, leaf_b, leaf_c],
        [{"source": 1, "target": 2}],
    )  # 1.08μs -> 912ns (18.6% faster)

    assert len(result) == 2
    # Membership compares whole dicts, so the extra attributes must match too.
    assert leaf_b in result
    assert leaf_c in result


def test_edges_with_extra_attributes():
    """Extra keys on edge dicts (weight, label) do not affect the result."""
    nodes = [
        {"id": 1, "name": "A"},
        {"id": 2, "name": "B"},
        {"id": 3, "name": "C"},
    ]
    first_edge = {"source": 1, "target": 2, "weight": 5, "label": "edge1"}
    second_edge = {"source": 2, "target": 3, "weight": 10, "label": "edge2"}

    result = find_leaf_nodes(nodes, [first_edge, second_edge])  # 1.09μs -> 901ns (21.2% faster)

    assert result == [{"id": 3, "name": "C"}]


def test_duplicate_nodes():
    """Two nodes sharing id 1 are both excluded by a single edge from id 1."""
    original = {"id": 1, "name": "A"}
    duplicate = {"id": 1, "name": "A_duplicate"}
    target = {"id": 2, "name": "B"}

    result = find_leaf_nodes(
        [original, duplicate, target],
        [{"source": 1, "target": 2}],
    )  # 1.00μs -> 882ns (13.6% faster)

    # Neither id-1 node survives; the single remaining leaf is node 2.
    assert len(result) == 1
    first_leaf = result[0]
    assert first_leaf["id"] == 2


def test_self_loop():
    """A self-loop counts as an outgoing edge for its node."""
    looping = {"id": 1, "name": "A"}
    plain = {"id": 2, "name": "B"}
    self_edge = {"source": 1, "target": 1}  # node 1 points at itself

    result = find_leaf_nodes([looping, plain], [self_edge])  # 811ns -> 791ns (2.53% faster)

    assert len(result) == 1
    assert plain in result


def test_edge_to_nonexistent_node():
    """An edge whose target is not in the node list still marks its source."""
    nodes = [{"id": 1, "name": "A"}, {"id": 2, "name": "B"}]
    # Target 99 has no matching node.
    edge_to_missing = {"source": 1, "target": 99}

    result = find_leaf_nodes(nodes, [edge_to_missing])  # 872ns -> 802ns (8.73% faster)

    # Node 1 has an outgoing edge; node 2 remains the only leaf.
    assert result == [{"id": 2, "name": "B"}]


def test_node_with_zero_id():
    """An id of 0 is handled like any other id."""
    zero = {"id": 0, "name": "A"}
    one = {"id": 1, "name": "B"}

    result = find_leaf_nodes([zero, one], [{"source": 1, "target": 0}])  # 881ns -> 821ns (7.31% faster)

    assert result == [zero]


def test_negative_node_ids():
    """Negative integer ids are matched like any other id."""
    minus_one = {"id": -1, "name": "A"}
    minus_two = {"id": -2, "name": "B"}
    plus_one = {"id": 1, "name": "C"}

    result = find_leaf_nodes(
        [minus_one, minus_two, plus_one],
        [{"source": -1, "target": -2}],
    )  # 1.03μs -> 1.21μs (14.9% slower)

    assert len(result) == 2
    assert minus_two in result
    assert plus_one in result


def test_none_as_node_attribute_value():
    """A None attribute value on a node does not affect leaf detection."""
    unnamed = {"id": 1, "name": None}
    named = {"id": 2, "name": "B"}

    result = find_leaf_nodes([unnamed, named], [{"source": 1, "target": 2}])  # 872ns -> 821ns (6.21% faster)

    assert result == [named]


def test_empty_string_node_id():
    """The empty string is a usable node id."""
    blank = {"id": "", "name": "A"}
    other = {"id": "b", "name": "B"}

    result = find_leaf_nodes([blank, other], [{"source": "", "target": "b"}])  # 1.04μs -> 881ns (18.2% faster)

    assert result == [other]


def test_unicode_node_ids():
    """Non-ASCII string ids are matched like any other string id."""
    alpha = {"id": "α", "name": "Alpha"}
    beta = {"id": "β", "name": "Beta"}
    gamma = {"id": "γ", "name": "Gamma"}

    result = find_leaf_nodes(
        [alpha, beta, gamma],
        [{"source": "α", "target": "β"}],
    )  # 1.17μs -> 991ns (18.3% faster)

    assert len(result) == 2
    assert beta in result
    assert gamma in result


def test_large_number_of_leaf_nodes():
    """Test with many leaf nodes and few edges.

    1000 nodes, of which only ids 0-9 have an outgoing edge; the remaining
    990 nodes (ids 10-999) must all be reported as leaves.
    """
    # Create 1000 nodes
    nodes = [{"id": i, "name": f"Node_{i}"} for i in range(1000)]
    # Sources must be 0..9, hence range(10).  range(9) yields only nine
    # sources, which makes node 9 a leaf as well (991 leaves) and contradicts
    # the count asserted below.
    edges = [{"source": i, "target": i + 1} for i in range(10)]

    # Timing annotation was recorded with the earlier nine-edge input.
    result = find_leaf_nodes(nodes, edges)  # 380μs -> 32.8μs (1062% faster)
    # Nodes 10-999 should be leaves (990 nodes)
    assert len(result) == 990
    # Exactly ids 10..999, in input order.
    assert [node["id"] for node in result] == list(range(10, 1000))


def test_large_number_of_edges():
    """Ninety-nine edges from node 0 disqualify node 0 and nothing else."""
    nodes = [{"id": i, "name": f"Node_{i}"} for i in range(100)]
    # Node 0 points at every other node.
    edges = [{"source": 0, "target": target} for target in range(1, 100)]

    result = find_leaf_nodes(nodes, edges)  # 319μs -> 6.66μs (4698% faster)

    # All 99 other nodes are leaves, and node 0 is not among them.
    assert len(result) == 99
    assert all(node["id"] != 0 for node in result)


def test_large_complete_graph():
    """In a complete directed graph on 50 nodes there are no leaves."""
    n = 50
    nodes = [{"id": i, "name": f"Node_{i}"} for i in range(n)]
    # One edge for every ordered pair of distinct nodes.
    edges = [
        {"source": src, "target": dst}
        for src in range(n)
        for dst in range(n)
        if src != dst
    ]

    result = find_leaf_nodes(nodes, edges)  # 1.94ms -> 51.7μs (3648% faster)

    # Every node has outgoing edges, so none is a leaf.
    assert result == []


def test_large_chain_graph():
    """Long chain 0 -> 1 -> ... -> 999: only the last node is a leaf."""
    n = 1000
    nodes = [{"id": i, "name": f"Node_{i}"} for i in range(n)]
    edges = [{"source": src, "target": src + 1} for src in range(n - 1)]

    result = find_leaf_nodes(nodes, edges)  # 17.6ms -> 57.5μs (30473% faster)

    assert len(result) == 1
    last_leaf = result[0]
    assert last_leaf["id"] == n - 1


def test_large_star_graph():
    """Star graph: the centre points at all 499 others, which are the leaves."""
    center_id = 0
    n = 500
    nodes = [{"id": i, "name": f"Node_{i}"} for i in range(n)]
    spokes = [{"source": center_id, "target": target} for target in range(1, n)]

    result = find_leaf_nodes(nodes, spokes)  # 8.26ms -> 26.1μs (31514% faster)

    # Everything except the centre is a leaf.
    assert len(result) == n - 1
    assert all(node["id"] != center_id for node in result)
    # Spot-check a few leaves.
    for expected_id in (1, 250, 499):
        assert any(node["id"] == expected_id for node in result)


def test_large_bipartite_graph():
    """Complete bipartite graph, left (0-199) -> right (200-399): right side is all leaves."""
    n = 200  # nodes per partition, 400 in total
    nodes = [{"id": i, "name": f"Node_{i}"} for i in range(n * 2)]
    # Every left node points at every right node.
    edges = [
        {"source": left, "target": right}
        for left in range(n)
        for right in range(n, n * 2)
    ]

    result = find_leaf_nodes(nodes, edges)  # 403ms -> 770μs (52266% faster)

    # Exactly n leaves, none of them from the left partition.
    assert len(result) == n
    assert not any(node["id"] < n for node in result)


def test_large_sparse_graph():
    """Sparse graph: 1000 nodes with an outgoing edge on every 100th node only."""
    n = 1000
    nodes = [{"id": i, "name": f"Node_{i}"} for i in range(n)]
    # Sources are 0, 100, 200, ... below n - 1.
    edges = [{"source": src, "target": src + 1} for src in range(0, n - 1, 100)]

    result = find_leaf_nodes(nodes, edges)  # 460μs -> 35.5μs (1195% faster)

    # Each edge has a distinct source, so it removes exactly one leaf.
    leaf_count = len(result)
    assert leaf_count == n - len(edges)
    assert leaf_count > 980


def test_large_nodes_with_many_attributes():
    """Nodes with 50 extra attributes each are returned with them intact."""
    nodes = [
        {
            "id": i,
            "name": f"Node_{i}",
            # Fifty filler attributes per node, added after id and name.
            **{f"attr_{attr}": f"value_{attr}_{i}" for attr in range(50)},
        }
        for i in range(200)
    ]
    # Chain 0 -> 1 -> ... -> 199.
    edges = [{"source": src, "target": src + 1} for src in range(199)]

    result = find_leaf_nodes(nodes, edges)  # 657μs -> 12.6μs (5119% faster)

    # Only the chain's tail is a leaf.
    assert len(result) == 1
    tail = result[0]
    assert tail["id"] == 199
    # First and last filler attributes are still present.
    assert "attr_0" in tail
    assert "attr_49" in tail


def test_large_graph_with_mixed_id_types():
    """Two 500-node chains, one with int ids and one with str ids, in a single graph."""
    # Integer-id nodes come first, then string-id nodes.
    int_nodes = [{"id": i, "name": f"Node_{i}", "type": "int"} for i in range(500)]
    str_nodes = [
        {"id": f"str_{i}", "name": f"StringNode_{i}", "type": "str"}
        for i in range(500)
    ]
    nodes = int_nodes + str_nodes

    # Each chain links every node to its successor of the same id type.
    int_edges = [{"source": i, "target": i + 1} for i in range(499)]
    str_edges = [{"source": f"str_{i}", "target": f"str_{i + 1}"} for i in range(499)]
    edges = int_edges + str_edges

    result = find_leaf_nodes(nodes, edges)  # 18.3ms -> 86.9μs (21003% faster)

    # One leaf per chain: 499 and "str_499".
    assert len(result) == 2
    for tail_id in (499, "str_499"):
        assert any(node["id"] == tail_id for node in result)
from src.algorithms.graph import find_leaf_nodes


def test_find_leaf_nodes():
    """Concolic input where the node id differs from the edge source; only checks the call completes."""
    nodes = [{"id": "\x01\x00"}]
    edges = [{"source": "\x00\x00"}]
    find_leaf_nodes(nodes, edges)


def test_find_leaf_nodes_2():
    """Concolic input where the node id equals the edge source; only checks the call completes."""
    nodes = [{"id": 3}]
    edges = [{"source": 3}]
    find_leaf_nodes(nodes, edges)
🔎 Click to see Concolic Coverage Tests

To edit these changes, run `git checkout codeflash/optimize-find_leaf_nodes-mmuc08dp` and push.

Codeflash Static Badge

The original nested loop scanned all edges for every node (O(n·m)), spending 99.9% of runtime on dictionary lookups inside that double loop. The optimization builds a set of source IDs once (O(m)), then checks membership in O(1) per node (O(n) total), reducing the 1,000-node chain test from 17.6 ms to 57.5 µs. A try-except wrapper falls back to list membership if any source value is unhashable, preserving correctness while still delivering massive speedups (roughly 12× to 520×, i.e. about 1,000% to 52,000%, across the large-graph tests that contain edges). Micro-benchmarks on empty inputs regress by ~30% due to set-construction overhead, but all realistic workloads show dramatic gains.
@codeflash-ai codeflash-ai bot requested a review from KRRT7 March 17, 2026 08:09
@codeflash-ai codeflash-ai bot added ⚡️ codeflash Optimization PR opened by Codeflash AI 🎯 Quality: High Optimization Quality according to Codeflash labels Mar 17, 2026
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

⚡️ codeflash Optimization PR opened by Codeflash AI 🎯 Quality: High Optimization Quality according to Codeflash

Projects

None yet

Development

Successfully merging this pull request may close these issues.

0 participants