Skip to content

Commit bfd42c1

Browse files
authored
remove relationships with phantom entities (microsoft#2261)
* Filter phantom relationships from the extracted graph
* Fix embed-text flush size to scale with the configured thread count
1 parent 1ceb00c commit bfd42c1

9 files changed

Lines changed: 612 additions & 4 deletions

File tree

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "filter phantom relationships in graph"
4+
}

packages/graphrag/graphrag/index/operations/embed_text/embed_text.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,18 @@ async def embed_text(
3333
id_column: str = "id",
3434
output_table: Table | None = None,
3535
) -> int:
36-
"""Embed text from a streaming Table into a vector store."""
36+
"""Embed text from a streaming Table into a vector store.
37+
38+
Rows are buffered before flushing to ``run_embed_text``,
39+
which dispatches API batches concurrently up to
40+
``num_threads``. The buffer is sized so each flush produces
41+
enough batches to saturate the concurrency limit.
42+
"""
3743
vector_store.create_index()
3844

3945
buffer: list[dict[str, Any]] = []
4046
total_rows = 0
41-
flush_size = batch_size * 4
47+
flush_size = batch_size * num_threads
4248

4349
async for row in input_table:
4450
text = row.get(embed_column)

packages/graphrag/graphrag/index/operations/extract_graph/extract_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1212
from graphrag.config.enums import AsyncType
1313
from graphrag.index.operations.extract_graph.graph_extractor import GraphExtractor
14+
from graphrag.index.operations.extract_graph.utils import filter_orphan_relationships
1415
from graphrag.index.utils.derive_from_rows import derive_from_rows
1516

1617
if TYPE_CHECKING:
@@ -67,6 +68,7 @@ async def run_strategy(row):
6768

6869
entities = _merge_entities(entity_dfs)
6970
relationships = _merge_relationships(relationship_dfs)
71+
relationships = filter_orphan_relationships(relationships, entities)
7072

7173
return (entities, relationships)
7274

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Copyright (C) 2026 Microsoft Corporation.
# Licensed under the MIT License

"""Utility functions for graph extraction operations."""

import logging

import pandas as pd

logger = logging.getLogger(__name__)


def filter_orphan_relationships(
    relationships: pd.DataFrame,
    entities: pd.DataFrame,
) -> pd.DataFrame:
    """Remove relationships whose source or target has no entity entry.

    After LLM graph extraction, the model may hallucinate entity
    names in relationships that have no corresponding entity row.
    This function drops those dangling references so downstream
    processing never encounters broken graph edges.

    Parameters
    ----------
    relationships:
        Merged relationship DataFrame with at least ``source``
        and ``target`` columns.
    entities:
        Merged entity DataFrame with at least a ``title`` column.

    Returns
    -------
    pd.DataFrame
        Relationships filtered to only those whose ``source``
        and ``target`` both appear in ``entities["title"]``.
    """
    # No relationships: nothing to filter. Normalize the index so the
    # return shape is consistent with the filtered path below.
    if relationships.empty:
        return relationships.reset_index(drop=True)

    # No entities: every relationship is an orphan. The previous early
    # return dropped them all *silently*; log here so this path matches
    # the observability of the main filtering path.
    if entities.empty:
        logger.warning(
            "Dropped %d relationship(s) referencing non-existent entities.",
            len(relationships),
        )
        return relationships.iloc[0:0].reset_index(drop=True)

    # Set membership gives O(1) lookups per endpoint test.
    entity_titles = set(entities["title"])
    mask = relationships["source"].isin(entity_titles) & relationships[
        "target"
    ].isin(entity_titles)
    filtered = relationships[mask].reset_index(drop=True)

    dropped = len(relationships) - len(filtered)
    if dropped > 0:
        logger.warning(
            "Dropped %d relationship(s) referencing non-existent entities.",
            dropped,
        )
    return filtered

packages/graphrag/graphrag/index/workflows/update_entities_relationships.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1515
from graphrag.config.models.graph_rag_config import GraphRagConfig
1616
from graphrag.data_model.data_reader import DataReader
17+
from graphrag.index.operations.extract_graph.utils import (
18+
filter_orphan_relationships,
19+
)
1720
from graphrag.index.run.utils import get_update_table_providers
1821
from graphrag.index.typing.context import PipelineRunContext
1922
from graphrag.index.typing.workflow import WorkflowFunctionOutput
@@ -79,6 +82,10 @@ async def _update_entities_and_relationships(
7982
delta_relationships,
8083
)
8184

85+
merged_relationships_df = filter_orphan_relationships(
86+
merged_relationships_df, merged_entities_df
87+
)
88+
8289
summarization_model_config = config.get_completion_model_config(
8390
config.summarize_descriptions.completion_model_id
8491
)

tests/unit/indexing/operations/embed_text/test_embed_text.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,13 @@ async def test_embed_text_basic():
150150

151151
@pytest.mark.asyncio
152152
async def test_embed_text_batching():
153-
"""Verify rows are flushed in batches when buffer exceeds batch_size * 4."""
153+
"""Verify rows are flushed in batches sized by batch_size * num_threads.
154+
155+
With batch_size=2 and num_threads=4, each flush holds up to
156+
8 rows (enough to produce 4 API batches that saturate the
157+
concurrency limit). 10 rows should produce 2 flushes:
158+
one of 8 rows and a final remainder of 2.
159+
"""
154160
rows = [{"id": str(i), "text": f"text {i}"} for i in range(10)]
155161
input_table = FakeInputTable(rows)
156162
vector_store = _make_mock_vector_store()
@@ -172,7 +178,7 @@ async def test_embed_text_batching():
172178
embed_column="text",
173179
batch_size=2,
174180
batch_max_tokens=8191,
175-
num_threads=1,
181+
num_threads=4,
176182
vector_store=vector_store,
177183
)
178184

0 commit comments

Comments
 (0)