Skip to content

Commit bfd42c1

Browse files
authored
remove relationships with phantom entities (microsoft#2261)
* Filter phantom relationships from the extracted graph
* Fix embed-text flush size to scale with the configured thread count
1 parent 1ceb00c commit bfd42c1

9 files changed

Lines changed: 612 additions & 4 deletions

File tree

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "filter phantom relationships in graph"
4+
}

packages/graphrag/graphrag/index/operations/embed_text/embed_text.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,18 @@ async def embed_text(
3333
id_column: str = "id",
3434
output_table: Table | None = None,
3535
) -> int:
36-
"""Embed text from a streaming Table into a vector store."""
36+
"""Embed text from a streaming Table into a vector store.
37+
38+
Rows are buffered before flushing to ``run_embed_text``,
39+
which dispatches API batches concurrently up to
40+
``num_threads``. The buffer is sized so each flush produces
41+
enough batches to saturate the concurrency limit.
42+
"""
3743
vector_store.create_index()
3844

3945
buffer: list[dict[str, Any]] = []
4046
total_rows = 0
41-
flush_size = batch_size * 4
47+
flush_size = batch_size * num_threads
4248

4349
async for row in input_table:
4450
text = row.get(embed_column)

packages/graphrag/graphrag/index/operations/extract_graph/extract_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1212
from graphrag.config.enums import AsyncType
1313
from graphrag.index.operations.extract_graph.graph_extractor import GraphExtractor
14+
from graphrag.index.operations.extract_graph.utils import filter_orphan_relationships
1415
from graphrag.index.utils.derive_from_rows import derive_from_rows
1516

1617
if TYPE_CHECKING:
@@ -67,6 +68,7 @@ async def run_strategy(row):
6768

6869
entities = _merge_entities(entity_dfs)
6970
relationships = _merge_relationships(relationship_dfs)
71+
relationships = filter_orphan_relationships(relationships, entities)
7072

7173
return (entities, relationships)
7274

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Copyright (C) 2026 Microsoft Corporation.
# Licensed under the MIT License

"""Utility functions for graph extraction operations."""

import logging

import pandas as pd

logger = logging.getLogger(__name__)


def filter_orphan_relationships(
    relationships: pd.DataFrame,
    entities: pd.DataFrame,
) -> pd.DataFrame:
    """Remove relationships whose source or target has no entity entry.

    After LLM graph extraction, the model may hallucinate entity
    names in relationships that have no corresponding entity row.
    This function drops those dangling references so downstream
    processing never encounters broken graph edges.

    Parameters
    ----------
    relationships:
        Merged relationship DataFrame with at least ``source``
        and ``target`` columns.
    entities:
        Merged entity DataFrame with at least a ``title`` column.

    Returns
    -------
    pd.DataFrame
        Relationships filtered to only those whose ``source``
        and ``target`` both appear in ``entities["title"]``.
    """
    # No relationships: nothing to filter. Normalize the index so the
    # return shape is consistent with the filtered path below.
    if relationships.empty:
        return relationships.reset_index(drop=True)

    # No entities: every relationship is an orphan. The previous early
    # return dropped them all *silently*; log here so this path matches
    # the observability of the main filtering path.
    if entities.empty:
        logger.warning(
            "Dropped %d relationship(s) referencing non-existent entities.",
            len(relationships),
        )
        return relationships.iloc[0:0].reset_index(drop=True)

    # Set membership gives O(1) lookups per endpoint test.
    entity_titles = set(entities["title"])
    mask = relationships["source"].isin(entity_titles) & relationships[
        "target"
    ].isin(entity_titles)
    filtered = relationships[mask].reset_index(drop=True)

    dropped = len(relationships) - len(filtered)
    if dropped > 0:
        logger.warning(
            "Dropped %d relationship(s) referencing non-existent entities.",
            dropped,
        )
    return filtered

packages/graphrag/graphrag/index/workflows/update_entities_relationships.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1515
from graphrag.config.models.graph_rag_config import GraphRagConfig
1616
from graphrag.data_model.data_reader import DataReader
17+
from graphrag.index.operations.extract_graph.utils import (
18+
filter_orphan_relationships,
19+
)
1720
from graphrag.index.run.utils import get_update_table_providers
1821
from graphrag.index.typing.context import PipelineRunContext
1922
from graphrag.index.typing.workflow import WorkflowFunctionOutput
@@ -79,6 +82,10 @@ async def _update_entities_and_relationships(
7982
delta_relationships,
8083
)
8184

85+
merged_relationships_df = filter_orphan_relationships(
86+
merged_relationships_df, merged_entities_df
87+
)
88+
8289
summarization_model_config = config.get_completion_model_config(
8390
config.summarize_descriptions.completion_model_id
8491
)

tests/unit/indexing/operations/embed_text/test_embed_text.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,13 @@ async def test_embed_text_basic():
150150

151151
@pytest.mark.asyncio
152152
async def test_embed_text_batching():
153-
"""Verify rows are flushed in batches when buffer exceeds batch_size * 4."""
153+
"""Verify rows are flushed in batches sized by batch_size * num_threads.
154+
155+
With batch_size=2 and num_threads=4, each flush holds up to
156+
8 rows (enough to produce 4 API batches that saturate the
157+
concurrency limit). 10 rows should produce 2 flushes:
158+
one of 8 rows and a final remainder of 2.
159+
"""
154160
rows = [{"id": str(i), "text": f"text {i}"} for i in range(10)]
155161
input_table = FakeInputTable(rows)
156162
vector_store = _make_mock_vector_store()
@@ -172,7 +178,7 @@ async def test_embed_text_batching():
172178
embed_column="text",
173179
batch_size=2,
174180
batch_max_tokens=8191,
175-
num_threads=1,
181+
num_threads=4,
176182
vector_store=vector_store,
177183
)
178184

0 commit comments

Comments
 (0)