From 740b12153d5c491b7f695d605f2be3f5e1ff70ab Mon Sep 17 00:00:00 2001 From: bsesic Date: Fri, 22 May 2026 14:13:17 +0200 Subject: [PATCH 01/27] chore(multi): scaffold the multi-witness subpackage Adds empty src/tracealign/multi/ and tests/multi/ packages. Subsequent tasks fill in distance, guide_tree, graph, table, merge, and api modules per docs/superpowers/specs/2026-05-21-trace-v0.2-multi-witness-design.md. --- src/tracealign/multi/__init__.py | 1 + tests/multi/__init__.py | 0 2 files changed, 1 insertion(+) create mode 100644 src/tracealign/multi/__init__.py create mode 100644 tests/multi/__init__.py diff --git a/src/tracealign/multi/__init__.py b/src/tracealign/multi/__init__.py new file mode 100644 index 0000000..4c37647 --- /dev/null +++ b/src/tracealign/multi/__init__.py @@ -0,0 +1 @@ +"""Multi-witness alignment subpackage (v0.2).""" diff --git a/tests/multi/__init__.py b/tests/multi/__init__.py new file mode 100644 index 0000000..e69de29 From 304db3c3c64a965a857c5ae78149dacced9c9058 Mon Sep 17 00:00:00 2001 From: bsesic Date: Fri, 22 May 2026 20:43:20 +0200 Subject: [PATCH 02/27] feat(multi): add GraphNode and GraphEdge types GraphNode carries the witness_id -> Token mapping at one aligned position. GraphEdge carries the set of witnesses that traverse the directed edge. Both are Pydantic models with extra='forbid' for typo safety. 
--- src/tracealign/multi/graph.py | 30 +++++++++++++++++++++ tests/multi/test_graph_nodes.py | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 src/tracealign/multi/graph.py create mode 100644 tests/multi/test_graph_nodes.py diff --git a/src/tracealign/multi/graph.py b/src/tracealign/multi/graph.py new file mode 100644 index 0000000..cb5018a --- /dev/null +++ b/src/tracealign/multi/graph.py @@ -0,0 +1,30 @@ +"""Variant graph types and helpers.""" + +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict + +from tracealign.model import Token + + +class GraphNode(BaseModel): + """A position in the multi-witness alignment. + + A node carries the tokens from witnesses that are considered aligned at + this position. Nodes with zero tokens are the START and END sentinels. + """ + + model_config = ConfigDict(extra="forbid") + + id: str + tokens: dict[str, Token] + + +class GraphEdge(BaseModel): + """A directed edge in the variant DAG carrying the witnesses that traverse it.""" + + model_config = ConfigDict(extra="forbid") + + source_id: str + target_id: str + witnesses: set[str] diff --git a/tests/multi/test_graph_nodes.py b/tests/multi/test_graph_nodes.py new file mode 100644 index 0000000..9296d1f --- /dev/null +++ b/tests/multi/test_graph_nodes.py @@ -0,0 +1,48 @@ +"""Tests for GraphNode and GraphEdge.""" + +import pytest +from pydantic import ValidationError + +from tracealign.model import Token +from tracealign.multi.graph import GraphNode, GraphEdge + + +def _tok(text: str, position: int = 0) -> Token: + return Token( + id=f"x:{position:06d}", + position=position, + raw=text, + text=text, + ) + + +def test_graph_node_with_two_witnesses(): + node = GraphNode( + id="n:000001", + tokens={"W1": _tok("רבי"), "W2": _tok("רבי")}, + ) + assert node.id == "n:000001" + assert set(node.tokens.keys()) == {"W1", "W2"} + + +def test_graph_node_empty_tokens_for_sentinel(): + # START / END sentinels carry no tokens 
+ node = GraphNode(id="START", tokens={}) + assert node.tokens == {} + + +def test_graph_edge_with_witness_set(): + edge = GraphEdge(source_id="n:0", target_id="n:1", witnesses={"W1", "W2"}) + assert edge.source_id == "n:0" + assert edge.target_id == "n:1" + assert edge.witnesses == {"W1", "W2"} + + +def test_graph_node_rejects_extra_fields(): + with pytest.raises(ValidationError): + GraphNode(id="n:0", tokens={}, extra_field="nope") + + +def test_graph_edge_rejects_extra_fields(): + with pytest.raises(ValidationError): + GraphEdge(source_id="a", target_id="b", witnesses=set(), extra="nope") From fccd3ccf53d1ddf48d46a1b5dd8f0e43da0c2943 Mon Sep 17 00:00:00 2001 From: bsesic Date: Fri, 22 May 2026 20:47:51 +0200 Subject: [PATCH 03/27] feat(multi): add VariantGraph container VariantGraph holds a topologically sorted node list plus the edge list and the witness_ids that contribute to the alignment. START and END are sentinel nodes with empty tokens dicts. --- src/tracealign/multi/graph.py | 15 ++++++++++ tests/multi/test_variant_graph.py | 46 +++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 tests/multi/test_variant_graph.py diff --git a/src/tracealign/multi/graph.py b/src/tracealign/multi/graph.py index cb5018a..b64a4ea 100644 --- a/src/tracealign/multi/graph.py +++ b/src/tracealign/multi/graph.py @@ -28,3 +28,18 @@ class GraphEdge(BaseModel): source_id: str target_id: str witnesses: set[str] + + +class VariantGraph(BaseModel): + """A directed acyclic graph representing a multi-witness alignment. + + Nodes are topologically sorted with the START sentinel first and the END + sentinel last. Every witness path runs from START to END along edges + whose `witnesses` set contains the witness id. 
+ """ + + model_config = ConfigDict(extra="forbid") + + nodes: list[GraphNode] + edges: list[GraphEdge] + witness_ids: list[str] diff --git a/tests/multi/test_variant_graph.py b/tests/multi/test_variant_graph.py new file mode 100644 index 0000000..71341d4 --- /dev/null +++ b/tests/multi/test_variant_graph.py @@ -0,0 +1,46 @@ +"""Tests for VariantGraph container and topological order.""" + +from tracealign.model import Token +from tracealign.multi.graph import GraphEdge, GraphNode, VariantGraph + + +def _tok(text: str, position: int = 0) -> Token: + return Token( + id=f"x:{position:06d}", + position=position, + raw=text, + text=text, + ) + + +def test_variant_graph_holds_nodes_and_edges(): + nodes = [ + GraphNode(id="START", tokens={}), + GraphNode(id="n:0", tokens={"W1": _tok("a")}), + GraphNode(id="END", tokens={}), + ] + edges = [ + GraphEdge(source_id="START", target_id="n:0", witnesses={"W1"}), + GraphEdge(source_id="n:0", target_id="END", witnesses={"W1"}), + ] + g = VariantGraph(nodes=nodes, edges=edges, witness_ids=["W1"]) + assert g.witness_ids == ["W1"] + assert len(g.nodes) == 3 + assert len(g.edges) == 2 + + +def test_variant_graph_first_node_is_start_last_is_end(): + g = VariantGraph( + nodes=[ + GraphNode(id="START", tokens={}), + GraphNode(id="n:0", tokens={"W1": _tok("a")}), + GraphNode(id="END", tokens={}), + ], + edges=[ + GraphEdge(source_id="START", target_id="n:0", witnesses={"W1"}), + GraphEdge(source_id="n:0", target_id="END", witnesses={"W1"}), + ], + witness_ids=["W1"], + ) + assert g.nodes[0].id == "START" + assert g.nodes[-1].id == "END" From 6989eed8805067b1b2b44e5165abfbb34fd1c97a Mon Sep 17 00:00:00 2001 From: bsesic Date: Fri, 22 May 2026 20:54:33 +0200 Subject: [PATCH 04/27] feat(multi): add VariantGraph.from_sequence and witness_path from_sequence builds the initial linear graph for a single witness; this is the seed of progressive_merge. 
witness_path walks the edges where a given witness appears, returning the non-sentinel nodes traversed; it is the basis of the lossless-reconstruction correctness property. --- src/tracealign/multi/graph.py | 45 +++++++++++++++++++++++++ tests/multi/test_variant_graph_paths.py | 45 +++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 tests/multi/test_variant_graph_paths.py diff --git a/src/tracealign/multi/graph.py b/src/tracealign/multi/graph.py index b64a4ea..18b4808 100644 --- a/src/tracealign/multi/graph.py +++ b/src/tracealign/multi/graph.py @@ -43,3 +43,48 @@ class VariantGraph(BaseModel): nodes: list[GraphNode] edges: list[GraphEdge] witness_ids: list[str] + + @classmethod + def from_sequence(cls, witness_id: str, tokens: list[Token]) -> "VariantGraph": + """Build a linear graph for a single witness.""" + nodes: list[GraphNode] = [GraphNode(id="START", tokens={})] + for i, tok in enumerate(tokens): + nodes.append(GraphNode(id=f"n:{i:06d}", tokens={witness_id: tok})) + nodes.append(GraphNode(id="END", tokens={})) + + edges: list[GraphEdge] = [] + for i in range(len(nodes) - 1): + edges.append( + GraphEdge( + source_id=nodes[i].id, + target_id=nodes[i + 1].id, + witnesses={witness_id}, + ) + ) + + return cls(nodes=nodes, edges=edges, witness_ids=[witness_id]) + + def witness_path(self, witness_id: str) -> list[GraphNode]: + """Return the non-sentinel nodes traversed by `witness_id` in order.""" + # Build adjacency: source_id -> list[(target_id, witnesses)] + adj: dict[str, list[tuple[str, set[str]]]] = {} + for edge in self.edges: + adj.setdefault(edge.source_id, []).append((edge.target_id, edge.witnesses)) + + nodes_by_id = {n.id: n for n in self.nodes} + path: list[GraphNode] = [] + cur = "START" + while cur != "END": + next_id = None + for target_id, witnesses in adj.get(cur, []): + if witness_id in witnesses: + next_id = target_id + break + if next_id is None: + # No outgoing edge for this witness — should never happen for a + # 
consistent graph; treat as end of path. + break + if next_id != "END": + path.append(nodes_by_id[next_id]) + cur = next_id + return path diff --git a/tests/multi/test_variant_graph_paths.py b/tests/multi/test_variant_graph_paths.py new file mode 100644 index 0000000..b091c45 --- /dev/null +++ b/tests/multi/test_variant_graph_paths.py @@ -0,0 +1,45 @@ +"""Tests for VariantGraph.from_sequence and witness_path.""" + +from tracealign.model import Token +from tracealign.multi.graph import VariantGraph + + +def _tok(text: str, position: int = 0) -> Token: + return Token( + id=f"x:{position:06d}", + position=position, + raw=text, + text=text, + ) + + +def test_from_sequence_produces_linear_chain(): + seq = [_tok("a", 0), _tok("b", 1), _tok("c", 2)] + g = VariantGraph.from_sequence("W1", seq) + + # Three content nodes plus START and END = 5 total + assert len(g.nodes) == 5 + assert g.nodes[0].id == "START" + assert g.nodes[-1].id == "END" + assert g.witness_ids == ["W1"] + + # Edges: START -> n0 -> n1 -> n2 -> END, all carrying {"W1"} + assert len(g.edges) == 4 + for edge in g.edges: + assert edge.witnesses == {"W1"} + + +def test_witness_path_excludes_sentinels_and_reconstructs_input(): + seq = [_tok("a", 0), _tok("b", 1), _tok("c", 2)] + g = VariantGraph.from_sequence("W1", seq) + path = g.witness_path("W1") + + assert len(path) == 3 + assert [n.tokens["W1"].text for n in path] == ["a", "b", "c"] + assert all(n.id not in ("START", "END") for n in path) + + +def test_from_sequence_empty_sequence_gives_start_then_end(): + g = VariantGraph.from_sequence("W1", []) + assert [n.id for n in g.nodes] == ["START", "END"] + assert g.witness_path("W1") == [] From 3c86fe14315753498688dd4a4b5ea6585b3f8e6f Mon Sep 17 00:00:00 2001 From: bsesic Date: Fri, 22 May 2026 20:58:06 +0200 Subject: [PATCH 05/27] feat(multi): add VariantGraph.variants generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Yields the variant loci — non-sentinel nodes 
whose constituent witnesses disagree on the token text. Nodes carrying identical text from multiple witnesses count as agreement, not variation. --- src/tracealign/multi/graph.py | 9 ++++ tests/multi/test_variant_graph_variants.py | 60 ++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 tests/multi/test_variant_graph_variants.py diff --git a/src/tracealign/multi/graph.py b/src/tracealign/multi/graph.py index 18b4808..4d0d933 100644 --- a/src/tracealign/multi/graph.py +++ b/src/tracealign/multi/graph.py @@ -88,3 +88,12 @@ def witness_path(self, witness_id: str) -> list[GraphNode]: path.append(nodes_by_id[next_id]) cur = next_id return path + + def variants(self): + """Yield non-sentinel nodes whose witnesses disagree (>1 distinct token texts).""" + for node in self.nodes: + if node.id in ("START", "END"): + continue + texts = {t.text for t in node.tokens.values()} + if len(texts) > 1: + yield node diff --git a/tests/multi/test_variant_graph_variants.py b/tests/multi/test_variant_graph_variants.py new file mode 100644 index 0000000..f814cae --- /dev/null +++ b/tests/multi/test_variant_graph_variants.py @@ -0,0 +1,60 @@ +"""Tests for VariantGraph.variants — yields variant loci.""" + +from tracealign.model import Token +from tracealign.multi.graph import GraphEdge, GraphNode, VariantGraph + + +def _tok(text: str, position: int = 0) -> Token: + return Token( + id=f"x:{position:06d}", + position=position, + raw=text, + text=text, + ) + + +def test_variants_yields_nodes_with_distinct_token_texts(): + nodes = [ + GraphNode(id="START", tokens={}), + GraphNode(id="n:0", tokens={"W1": _tok("a"), "W2": _tok("a")}), + GraphNode(id="n:1", tokens={"W1": _tok("b"), "W2": _tok("c")}), + GraphNode(id="END", tokens={}), + ] + edges = [ + GraphEdge(source_id="START", target_id="n:0", witnesses={"W1", "W2"}), + GraphEdge(source_id="n:0", target_id="n:1", witnesses={"W1", "W2"}), + GraphEdge(source_id="n:1", target_id="END", witnesses={"W1", "W2"}), + ] + g = 
VariantGraph(nodes=nodes, edges=edges, witness_ids=["W1", "W2"]) + + variants = list(g.variants()) + assert len(variants) == 1 + assert variants[0].id == "n:1" + + +def test_variants_ignores_single_witness_nodes(): + nodes = [ + GraphNode(id="START", tokens={}), + GraphNode(id="n:0", tokens={"W1": _tok("a")}), # unique reading + GraphNode(id="END", tokens={}), + ] + edges = [ + GraphEdge(source_id="START", target_id="n:0", witnesses={"W1"}), + GraphEdge(source_id="n:0", target_id="END", witnesses={"W1"}), + ] + g = VariantGraph(nodes=nodes, edges=edges, witness_ids=["W1"]) + assert list(g.variants()) == [] + + +def test_variants_treats_identical_texts_as_agreement(): + # Same text from two witnesses is agreement, not a variant + node = GraphNode(id="n:0", tokens={"W1": _tok("שלום"), "W2": _tok("שלום")}) + g = VariantGraph( + nodes=[GraphNode(id="START", tokens={}), node, GraphNode(id="END", tokens={})], + edges=[ + GraphEdge(source_id="START", target_id="n:0", witnesses={"W1", "W2"}), + GraphEdge(source_id="n:0", target_id="END", witnesses={"W1", "W2"}), + ], + witness_ids=["W1", "W2"], + ) + assert list(g.variants()) == [] From 830afc64f2affd9650ef423b6270aa943f49deaf Mon Sep 17 00:00:00 2001 From: bsesic Date: Tue, 26 May 2026 17:50:00 +0200 Subject: [PATCH 06/27] feat(multi): add AlignedTable, TableColumn, TableCell types AlignedTable is the tabular view over a VariantGraph. Each TableCell carries either a Token or a None gap, plus a backreference to the VariantGraph node it came from. 
--- src/tracealign/multi/table.py | 37 +++++++++++++++++++++ tests/multi/test_aligned_table.py | 53 +++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 src/tracealign/multi/table.py create mode 100644 tests/multi/test_aligned_table.py diff --git a/src/tracealign/multi/table.py b/src/tracealign/multi/table.py new file mode 100644 index 0000000..9f955c8 --- /dev/null +++ b/src/tracealign/multi/table.py @@ -0,0 +1,37 @@ +"""Aligned table view derived from a VariantGraph.""" + +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict + +from tracealign.model import Token + + +class TableCell(BaseModel): + """One cell in the aligned table — a token or a gap.""" + + model_config = ConfigDict(extra="forbid") + + token: Token | None + node_id: str | None + + +class TableColumn(BaseModel): + """One column of the aligned table — one cell per witness.""" + + model_config = ConfigDict(extra="forbid") + + cells: dict[str, TableCell] + + +class AlignedTable(BaseModel): + """Tabular view over a VariantGraph. + + Rows correspond to witnesses; columns to aligned positions. Cells whose + token is None represent gaps relative to the column's consensus. 
+ """ + + model_config = ConfigDict(extra="forbid") + + witnesses: list[str] + columns: list[TableColumn] diff --git a/tests/multi/test_aligned_table.py b/tests/multi/test_aligned_table.py new file mode 100644 index 0000000..4a7ce62 --- /dev/null +++ b/tests/multi/test_aligned_table.py @@ -0,0 +1,53 @@ +"""Tests for AlignedTable basic structure.""" + +from tracealign.model import Token +from tracealign.multi.table import AlignedTable, TableCell, TableColumn + + +def _tok(text: str, position: int = 0) -> Token: + return Token( + id=f"x:{position:06d}", + position=position, + raw=text, + text=text, + ) + + +def test_table_cell_carries_token_and_node_id(): + cell = TableCell(token=_tok("a"), node_id="n:0") + assert cell.token is not None + assert cell.token.text == "a" + assert cell.node_id == "n:0" + + +def test_table_cell_gap_has_none_token(): + cell = TableCell(token=None, node_id=None) + assert cell.token is None + assert cell.node_id is None + + +def test_table_column_holds_cells_per_witness(): + col = TableColumn( + cells={ + "W1": TableCell(token=_tok("a"), node_id="n:0"), + "W2": TableCell(token=None, node_id=None), + } + ) + assert col.cells["W1"].token.text == "a" + assert col.cells["W2"].token is None + + +def test_aligned_table_has_witnesses_and_columns(): + table = AlignedTable( + witnesses=["W1", "W2"], + columns=[ + TableColumn( + cells={ + "W1": TableCell(token=_tok("a"), node_id="n:0"), + "W2": TableCell(token=_tok("a"), node_id="n:0"), + } + ) + ], + ) + assert table.witnesses == ["W1", "W2"] + assert len(table.columns) == 1 From c1db620b0eaca6977a89e26c29018855e78765a5 Mon Sep 17 00:00:00 2001 From: bsesic Date: Tue, 26 May 2026 17:58:07 +0200 Subject: [PATCH 07/27] feat(multi): add AlignedTable.re_anchor and format_text re_anchor returns a new table with the requested witness as the first row; columns and cells are preserved verbatim. format_text renders the table as ASCII for inspection, with a gap marker for missing cells. 
--- src/tracealign/multi/table.py | 47 +++++++++++++++ tests/multi/test_aligned_table_anchor.py | 74 ++++++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 tests/multi/test_aligned_table_anchor.py diff --git a/src/tracealign/multi/table.py b/src/tracealign/multi/table.py index 9f955c8..da98473 100644 --- a/src/tracealign/multi/table.py +++ b/src/tracealign/multi/table.py @@ -35,3 +35,50 @@ class AlignedTable(BaseModel): witnesses: list[str] columns: list[TableColumn] + + def re_anchor(self, base_witness: str) -> "AlignedTable": + """Return a new AlignedTable where `base_witness` is rendered first. + + Only the row order changes; cells per (witness, column) are preserved + unchanged. This is purely a presentation transform. + """ + if base_witness not in self.witnesses: + raise ValueError(f"unknown witness: {base_witness}") + + new_witnesses = [base_witness] + [w for w in self.witnesses if w != base_witness] + return AlignedTable(witnesses=new_witnesses, columns=self.columns) + + def format_text(self, max_columns: int = 80) -> str: + """ASCII rendering of the aligned table. + + Each row is one witness; gaps are shown as a centred dash. The + rendering truncates each column to fit `max_columns` total width. 
+ """ + gap_marker = "—" + col_widths: list[int] = [] + for col in self.columns: + widest = max( + (len(c.token.text) if c.token is not None else len(gap_marker)) + for c in col.cells.values() + ) + col_widths.append(max(widest, len(gap_marker))) + + # Truncate to fit max_columns; total width = sum(col_widths) + len(cols) + rows: list[str] = [] + label_width = max(len(w) for w in self.witnesses) + for w in self.witnesses: + parts = [f"{w:<{label_width}}"] + running = label_width + 1 + for col, width in zip(self.columns, col_widths): + cell = col.cells.get(w) + if cell is None or cell.token is None: + text = gap_marker + else: + text = cell.token.text + if running + width + 1 > max_columns: + parts.append("…") + break + parts.append(f"{text:<{width}}") + running += width + 1 + rows.append(" ".join(parts)) + return "\n".join(rows) diff --git a/tests/multi/test_aligned_table_anchor.py b/tests/multi/test_aligned_table_anchor.py new file mode 100644 index 0000000..db60ea1 --- /dev/null +++ b/tests/multi/test_aligned_table_anchor.py @@ -0,0 +1,74 @@ +"""Tests for AlignedTable.re_anchor and format_text.""" + +from tracealign.model import Token +from tracealign.multi.table import AlignedTable, TableCell, TableColumn + + +def _tok(text: str, position: int = 0) -> Token: + return Token( + id=f"x:{position:06d}", + position=position, + raw=text, + text=text, + ) + + +def _make_table() -> AlignedTable: + return AlignedTable( + witnesses=["W1", "W2"], + columns=[ + TableColumn( + cells={ + "W1": TableCell(token=_tok("a"), node_id="n:0"), + "W2": TableCell(token=_tok("a"), node_id="n:0"), + } + ), + TableColumn( + cells={ + "W1": TableCell(token=_tok("b"), node_id="n:1"), + "W2": TableCell(token=None, node_id=None), + } + ), + ], + ) + + +def test_re_anchor_moves_base_witness_to_front(): + table = _make_table() + re = table.re_anchor("W2") + assert re.witnesses[0] == "W2" + assert set(re.witnesses) == {"W1", "W2"} + + +def test_re_anchor_preserves_alignment_relationships(): + 
table = _make_table() + re = table.re_anchor("W2") + # The cells per (witness, column) must remain consistent; re_anchor only + # changes display order, not which token belongs to which witness at + # which column. + for original, anchored in zip(table.columns, re.columns): + for wid in ("W1", "W2"): + orig = original.cells[wid] + new = anchored.cells[wid] + assert orig.token == new.token + assert orig.node_id == new.node_id + + +def test_re_anchor_to_unknown_witness_raises(): + table = _make_table() + try: + table.re_anchor("W99") + raised = False + except ValueError: + raised = True + assert raised + + +def test_format_text_renders_columns_with_witnesses(): + table = _make_table() + rendered = table.format_text() + # Witness labels appear as row prefixes + assert "W1" in rendered + assert "W2" in rendered + # Gap is shown as a placeholder + assert "—" in rendered or "-" in rendered or "·" in rendered From 4a33997b7127d7576d27dbb4fabf44d59d82be90 Mon Sep 17 00:00:00 2001 From: bsesic Date: Tue, 26 May 2026 18:00:47 +0200 Subject: [PATCH 08/27] feat(multi): add GuideTreeNode, GuideTree, and format_text Pydantic recursive type for the binary UPGMA tree; the GuideTree carries the method, the original distance matrix, and the witness_ids labelling the matrix rows. format_text renders the tree as an indented ASCII tree. 
--- src/tracealign/multi/guide_tree.py | 54 +++++++++++++++++++++++ tests/multi/test_guide_tree_types.py | 64 ++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 src/tracealign/multi/guide_tree.py create mode 100644 tests/multi/test_guide_tree_types.py diff --git a/src/tracealign/multi/guide_tree.py b/src/tracealign/multi/guide_tree.py new file mode 100644 index 0000000..5d3f01b --- /dev/null +++ b/src/tracealign/multi/guide_tree.py @@ -0,0 +1,54 @@ +"""Guide tree types built from a pairwise distance matrix.""" + +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict + + +class GuideTreeNode(BaseModel): + """One node in a binary guide tree. + + Leaves have `is_leaf=True`, `witness_id` set, and no children. Internal + nodes have `is_leaf=False`, `witness_id=None`, and exactly two children + (binary tree from UPGMA). + """ + + model_config = ConfigDict(extra="forbid") + + is_leaf: bool + witness_id: str | None + children: list["GuideTreeNode"] + height: float + + +GuideTreeNode.model_rebuild() + + +class GuideTree(BaseModel): + """A guide tree plus the distance matrix that generated it. + + The distance matrix is kept on the tree so that downstream stages (e.g. + Stage 7 stemmatic reconstruction) can reuse it without recomputation. 
+ """ + + model_config = ConfigDict(extra="forbid") + + root: GuideTreeNode + method: str + distance_matrix: list[list[float]] + witness_ids: list[str] + + def format_text(self) -> str: + """Render the tree as an indented ASCII listing.""" + lines: list[str] = [] + self._render(self.root, lines, depth=0) + return "\n".join(lines) + + def _render(self, node: GuideTreeNode, lines: list[str], depth: int) -> None: + indent = " " * depth + if node.is_leaf: + lines.append(f"{indent}- {node.witness_id} (h={node.height:.4f})") + else: + lines.append(f"{indent}+ (h={node.height:.4f})") + for child in node.children: + self._render(child, lines, depth + 1) diff --git a/tests/multi/test_guide_tree_types.py b/tests/multi/test_guide_tree_types.py new file mode 100644 index 0000000..fe1e5f2 --- /dev/null +++ b/tests/multi/test_guide_tree_types.py @@ -0,0 +1,64 @@ +"""Tests for the GuideTree data types and format_text.""" + +from tracealign.multi.guide_tree import GuideTree, GuideTreeNode + + +def _leaf(wid: str) -> GuideTreeNode: + return GuideTreeNode(is_leaf=True, witness_id=wid, children=[], height=0.0) + + +def test_guide_tree_leaf(): + leaf = _leaf("W1") + assert leaf.is_leaf is True + assert leaf.witness_id == "W1" + assert leaf.children == [] + assert leaf.height == 0.0 + + +def test_guide_tree_internal_node(): + node = GuideTreeNode( + is_leaf=False, + witness_id=None, + children=[_leaf("W1"), _leaf("W2")], + height=0.5, + ) + assert node.is_leaf is False + assert node.witness_id is None + assert len(node.children) == 2 + assert node.height == 0.5 + + +def test_guide_tree_with_distance_matrix(): + tree = GuideTree( + root=GuideTreeNode( + is_leaf=False, + witness_id=None, + children=[_leaf("W1"), _leaf("W2")], + height=0.25, + ), + method="upgma", + distance_matrix=[[0.0, 0.5], [0.5, 0.0]], + witness_ids=["W1", "W2"], + ) + assert tree.method == "upgma" + assert tree.distance_matrix == [[0.0, 0.5], [0.5, 0.0]] + assert tree.witness_ids == ["W1", "W2"] + + +def 
test_format_text_renders_indented_tree(): + tree = GuideTree( + root=GuideTreeNode( + is_leaf=False, + witness_id=None, + children=[_leaf("W1"), _leaf("W2")], + height=0.25, + ), + method="upgma", + distance_matrix=[[0.0, 0.5], [0.5, 0.0]], + witness_ids=["W1", "W2"], + ) + rendered = tree.format_text() + assert "W1" in rendered + assert "W2" in rendered + # Some indication of structure (height or indentation) + assert "0.25" in rendered or "0.5" in rendered From 874f7f26fdf2c4dd1605da494ff5c66d377b1449 Mon Sep 17 00:00:00 2001 From: bsesic Date: Tue, 26 May 2026 18:42:15 +0200 Subject: [PATCH 09/27] feat(multi): add pairwise_distances using v0.1 align() Computes the N x N symmetric distance matrix as 1 - total_score of each pairwise alignment. Returns the matrix plus the canonical (sorted) witness_id ordering used for rows and columns, guaranteeing input-order independence. --- src/tracealign/multi/distance.py | 32 +++++++++++++++++++++ tests/multi/test_distance.py | 49 ++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 src/tracealign/multi/distance.py create mode 100644 tests/multi/test_distance.py diff --git a/src/tracealign/multi/distance.py b/src/tracealign/multi/distance.py new file mode 100644 index 0000000..353f198 --- /dev/null +++ b/src/tracealign/multi/distance.py @@ -0,0 +1,32 @@ +"""Pairwise distance matrix for multi-witness alignment.""" + +from __future__ import annotations + +import numpy as np + +from tracealign.align import AlignerConfig +from tracealign.align import align as pairwise_align +from tracealign.lang.base import LanguagePack +from tracealign.model import Token + + +def pairwise_distances( + witnesses: dict[str, list[Token]], + pack: LanguagePack, + pairwise_cfg: AlignerConfig, +) -> tuple[np.ndarray, list[str]]: + """Compute the N x N pairwise distance matrix using v0.1's pairwise aligner. 
+ + Returns the matrix and the canonical witness_id ordering (sorted + lexicographically) used for rows and columns. + """ + wids = sorted(witnesses.keys()) + n = len(wids) + D = np.zeros((n, n), dtype=np.float64) + for i in range(n): + for j in range(i + 1, n): + result = pairwise_align(witnesses[wids[i]], witnesses[wids[j]], pack, pairwise_cfg) + d = 1.0 - result.total_score + D[i, j] = d + D[j, i] = d + return D, wids diff --git a/tests/multi/test_distance.py b/tests/multi/test_distance.py new file mode 100644 index 0000000..abc50ad --- /dev/null +++ b/tests/multi/test_distance.py @@ -0,0 +1,49 @@ +"""Tests for the pairwise distance matrix.""" + +import numpy as np + +import tracealign +from tracealign.align import AlignerConfig +from tracealign.multi.distance import pairwise_distances + + +def test_pairwise_distances_identical_witnesses_distance_zero(): + a = tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1") + b = tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W2") + pack = tracealign.get_language("hbo") + matrix, wids = pairwise_distances({"W1": a, "W2": b}, pack, AlignerConfig()) + + assert wids == ["W1", "W2"] + assert matrix.shape == (2, 2) + assert matrix[0, 0] == 0.0 + assert matrix[1, 1] == 0.0 + assert abs(matrix[0, 1] - 0.0) < 1e-9 + assert matrix[0, 1] == matrix[1, 0] + + +def test_pairwise_distances_disjoint_witnesses_distance_near_one(): + a = tracealign.tokenize("aaa bbb ccc", lang="hbo", seq_label="W1") + b = tracealign.tokenize("xxx yyy zzz", lang="hbo", seq_label="W2") + pack = tracealign.get_language("hbo") + matrix, _ = pairwise_distances({"W1": a, "W2": b}, pack, AlignerConfig()) + assert matrix[0, 1] > 0.5 + + +def test_pairwise_distances_sorts_witness_ids_canonically(): + # dict insertion order is preserved by Python, but pairwise_distances + # must sort to guarantee deterministic output regardless of input order. 
+ a = tracealign.tokenize("שלום", lang="hbo", seq_label="W1") + b = tracealign.tokenize("שלום", lang="hbo", seq_label="W2") + pack = tracealign.get_language("hbo") + _, wids_ab = pairwise_distances({"W2": b, "W1": a}, pack, AlignerConfig()) + _, wids_ba = pairwise_distances({"W1": a, "W2": b}, pack, AlignerConfig()) + assert wids_ab == wids_ba == ["W1", "W2"] + + +def test_pairwise_distances_symmetric(): + a = tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1") + b = tracealign.tokenize("שלום אחר", lang="hbo", seq_label="W2") + pack = tracealign.get_language("hbo") + matrix, _ = pairwise_distances({"W1": a, "W2": b}, pack, AlignerConfig()) + assert matrix[0, 1] == matrix[1, 0] + np.testing.assert_array_equal(np.diag(matrix), np.zeros(2)) From b4a07722456b1b2ac41e4c53e6dfd0e0afa33320 Mon Sep 17 00:00:00 2001 From: bsesic Date: Wed, 27 May 2026 10:52:53 +0200 Subject: [PATCH 10/27] feat(multi): add UPGMA tree construction with deterministic tie-breaking build_upgma builds a binary GuideTree from a symmetric distance matrix. Ties on minimum distance are broken on the canonical (min, max) lexicographic order of the cluster members, making the algorithm order-independent at the input level. 
--- src/tracealign/multi/guide_tree.py | 104 +++++++++++++++++++++++++++++ tests/multi/test_upgma.py | 65 ++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 tests/multi/test_upgma.py diff --git a/src/tracealign/multi/guide_tree.py b/src/tracealign/multi/guide_tree.py index 5d3f01b..2dfe3b9 100644 --- a/src/tracealign/multi/guide_tree.py +++ b/src/tracealign/multi/guide_tree.py @@ -2,6 +2,7 @@ from __future__ import annotations +import numpy as np from pydantic import BaseModel, ConfigDict @@ -52,3 +53,106 @@ def _render(self, node: GuideTreeNode, lines: list[str], depth: int) -> None: lines.append(f"{indent}+ (h={node.height:.4f})") for child in node.children: self._render(child, lines, depth + 1) + + +def build_upgma(distance_matrix: "np.ndarray", witness_ids: list[str]) -> GuideTree: + """Build a UPGMA guide tree from a symmetric distance matrix. + + Ties are broken on the (min, max) lexicographic order of the cluster + members, guaranteeing determinism regardless of input order. 
+ """ + n = len(witness_ids) + + # Cluster representation: a list of (cluster_node, member_witness_ids_set) + clusters: list[tuple[GuideTreeNode, set[str]]] = [] + for wid in witness_ids: + leaf = GuideTreeNode(is_leaf=True, witness_id=wid, children=[], height=0.0) + clusters.append((leaf, {wid})) + + # Working distance matrix as a plain dict keyed by frozenset pair + D: dict[frozenset[str], float] = {} + for i in range(n): + for j in range(i + 1, n): + D[frozenset([witness_ids[i], witness_ids[j]])] = float(distance_matrix[i, j]) + + def _cluster_min_label(members: set[str]) -> str: + return min(members) + + def _cluster_pair_key(a_members: set[str], b_members: set[str]) -> tuple[str, str]: + # Canonical (min, max) sorted by lexicographic order + a_lo = _cluster_min_label(a_members) + b_lo = _cluster_min_label(b_members) + return tuple(sorted([a_lo, b_lo])) + + while len(clusters) > 1: + # Find the pair with the smallest distance; tie-break on canonical key + best_pair: tuple[int, int] | None = None + best_dist = float("inf") + best_key: tuple[str, str] | None = None + for i in range(len(clusters)): + for j in range(i + 1, len(clusters)): + members_i = clusters[i][1] + members_j = clusters[j][1] + pair_key = frozenset(members_i | members_j) + dist = D[pair_key] if pair_key in D \ + else _avg_distance(members_i, members_j, D) + key = _cluster_pair_key(members_i, members_j) + if dist < best_dist or (dist == best_dist and (best_key is None or key < best_key)): + best_dist = dist + best_pair = (i, j) + best_key = key + + i, j = best_pair # type: ignore[misc] + node_i, members_i = clusters[i] + node_j, members_j = clusters[j] + + # Order children deterministically: smaller min-member first + if _cluster_min_label(members_i) <= _cluster_min_label(members_j): + children = [node_i, node_j] + else: + children = [node_j, node_i] + + merged_node = GuideTreeNode( + is_leaf=False, + witness_id=None, + children=children, + height=best_dist / 2.0, + ) + merged_members = members_i 
def _avg_distance(a: set[str], b: set[str], D: dict[frozenset[str], float]) -> float:
    """Return the mean of ``D`` over every cross pair drawn from ``a`` and ``b``.

    Each pair ``(x, y)`` is looked up under the unordered key
    ``frozenset([x, y])``. When either set is empty there are no pairs and
    the result is ``0.0``.
    """
    pair_count = len(a) * len(b)
    if pair_count == 0:
        return 0.0

    # Accumulate with plain ``+=`` in iteration order so the floating-point
    # result is exactly the left-to-right running sum.
    running = 0.0
    for left in a:
        for right in b:
            running += D[frozenset((left, right))]
    return running / pair_count
merges first, then C joins. + # The root's children are the (AB) subtree and the C leaf. + root = tree.root + assert root.is_leaf is False + assert len(root.children) == 2 + leaf_children = [c for c in root.children if c.is_leaf] + inner_children = [c for c in root.children if not c.is_leaf] + assert len(leaf_children) == 1 + assert leaf_children[0].witness_id == "C" + assert len(inner_children) == 1 + inner = inner_children[0] + inner_leaves = {c.witness_id for c in inner.children} + assert inner_leaves == {"A", "B"} + + +def test_upgma_tie_breaking_uses_sorted_witness_ids(): + # D(A, B) == D(A, C), both 0.2. Tie-breaking should pick (A, B) because + # (min, max) lexicographic order favours it over (A, C). + D = np.array([ + [0.0, 0.2, 0.2], + [0.2, 0.0, 0.4], + [0.2, 0.4, 0.0], + ]) + tree = build_upgma(D, ["A", "B", "C"]) + # The first merge must combine A and B (lexicographic tie-break) + root = tree.root + inner = [c for c in root.children if not c.is_leaf] + assert len(inner) == 1 + inner_leaves = {c.witness_id for c in inner[0].children} + assert inner_leaves == {"A", "B"} + + +def test_upgma_preserves_distance_matrix_on_tree(): + D = np.array([[0.0, 0.5], [0.5, 0.0]]) + tree = build_upgma(D, ["W1", "W2"]) + assert tree.distance_matrix == [[0.0, 0.5], [0.5, 0.0]] + assert tree.witness_ids == ["W1", "W2"] From d1dad504902e07149f423bbccfa836b43c9bc46c Mon Sep 17 00:00:00 2001 From: bsesic Date: Wed, 27 May 2026 11:33:00 +0200 Subject: [PATCH 11/27] feat(multi): add post_order_witness_ids traversal helper Returns the witness ids in canonical post-order traversal of the binary guide tree. progressive_merge uses this as its canonical merge order, so witnesses that are similar (siblings or close cousins in the tree) are merged early, building consensus structure before distant witnesses are added. 
def post_order_witness_ids(tree: GuideTree) -> list[str]:
    """Collect the witness ids found at the leaves of ``tree``.

    Leaves are visited depth-first, following each node's ``children`` in
    their stored order, starting from ``tree.root``.
    """
    collected: list[str] = []
    _post_order(tree.root, collected)
    return collected


def _post_order(node: GuideTreeNode, out: list[str]) -> None:
    """Append the witness id of every leaf below ``node`` to ``out``.

    Leaves whose ``witness_id`` is ``None`` contribute nothing. The walk uses
    an explicit stack; children are pushed reversed so they are popped in
    their stored order.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if current.is_leaf:
            if current.witness_id is not None:
                out.append(current.witness_id)
            continue
        pending.extend(reversed(current.children))
def node_match_score(
    token: Token,
    node: GraphNode,
    pack: LanguagePack,
    mode: str = "max",
) -> float:
    """Aggregate the score of matching ``token`` against every token in ``node``.

    Each constituent of ``node.tokens`` is scored as
    ``_dp_score(tiered_score(token, constituent, pack).score)`` and the
    per-constituent scores are folded according to ``mode``.

    Args:
        token: The sequence token being placed.
        node: Graph node whose ``tokens`` mapping supplies the constituents.
        pack: Language pack forwarded to ``tiered_score``.
        mode: ``"max"`` (default), ``"min"`` or ``"mean"``.

    Returns:
        The aggregated score, or ``float("-inf")`` when ``node.tokens`` is
        empty (a sentinel node never matches a real token).

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    # Validate before any early return or scoring work: otherwise a typo in
    # ``mode`` would go unnoticed whenever the node happens to be a sentinel.
    if mode not in ("max", "min", "mean"):
        raise ValueError(f"unknown node_match mode: {mode}")

    if not node.tokens:
        # Sentinel node — never matches a real token
        return float("-inf")

    scores = [_dp_score(tiered_score(token, t, pack).score) for t in node.tokens.values()]
    if mode == "max":
        return max(scores)
    if mode == "min":
        return min(scores)
    return statistics.mean(scores)
def _topological_node_ids(graph) -> list[str]:
    """Return the graph's node ids in a deterministic topological order.

    Kahn's algorithm over ``graph.nodes`` (objects with an ``id``) and
    ``graph.edges`` (objects with ``source_id`` / ``target_id``). Whenever
    several nodes have had all of their predecessors emitted, the smallest
    id (by ``<``) is emitted first, which keeps the result deterministic.

    Parallel edges (the same source/target pair listed more than once) are
    counted once, so no id is ever emitted twice.

    Raises:
        KeyError: If an edge references an id that is not in ``graph.nodes``.
        ValueError: If the edges contain a cycle, in which case no complete
            topological order exists.
    """
    import heapq  # local import: keeps this block self-contained

    incoming: dict[str, set[str]] = {n.id: set() for n in graph.nodes}
    # A set (not a list) of successors so parallel edges collapse to one.
    outgoing: dict[str, set[str]] = {n.id: set() for n in graph.nodes}
    for edge in graph.edges:
        incoming[edge.target_id].add(edge.source_id)
        outgoing[edge.source_id].add(edge.target_id)

    # Min-heap of nodes whose predecessors have all been emitted.
    ready = [nid for nid, srcs in incoming.items() if not srcs]
    heapq.heapify(ready)

    out: list[str] = []
    while ready:
        nid = heapq.heappop(ready)
        out.append(nid)
        for tgt in outgoing[nid]:
            pending = incoming[tgt]
            pending.discard(nid)
            if not pending:
                heapq.heappush(ready, tgt)

    if len(out) != len(incoming):
        # Some nodes never became ready: they sit on (or behind) a cycle.
        stuck = sorted(nid for nid, srcs in incoming.items() if srcs)
        raise ValueError(f"graph is not a DAG; unresolved nodes: {stuck}")
    return out
def _run_poa_dp(
    seq,
    graph,
    pack: LanguagePack,
    pairwise_cfg: AlignerConfig,
    node_match_mode: str,
    gap_penalty: float,
) -> dict:
    """Forward DP for sequence-vs-graph alignment.

    Fills a table indexed by (number of consumed sequence tokens, node id)
    using three transitions over the topologically ordered graph:

    * match — consume ``seq[i]`` and advance from a predecessor of ``nid``
      to ``nid``; scored by :func:`node_match_score`. Never applied to the
      ``"START"`` or ``"END"`` node.
    * insert — consume ``seq[i]`` but stay at ``nid``; cost ``gap_penalty``.
      Never applied at ``"START"``.
    * delete — advance from a predecessor of ``nid`` to ``nid`` without
      consuming a sequence token; cost ``gap_penalty``, or ``0.0`` when
      either endpoint of the step is ``"START"`` or ``"END"``.

    Args:
        seq: The sequence being aligned; indexed as ``seq[i]`` and measured
            with ``len``.
        graph: Object exposing ``nodes`` (each with ``id``) and ``edges``
            (each with ``source_id`` / ``target_id``).
        pack: Language pack forwarded to :func:`node_match_score`.
        pairwise_cfg: Accepted but not read by this function.
        node_match_mode: Passed as ``mode`` to :func:`node_match_score`.
        gap_penalty: Added for insert steps and non-sentinel delete steps.

    Returns:
        A dict with:

        * ``dp`` — ``dp[i][node_id]`` is the best score reaching ``node_id``
          after consuming ``i`` sequence tokens (``NEG_INF`` if unreachable).
        * ``bp`` — ``bp[i][node_id]`` is ``(op, prev_i, prev_node_id)`` or
          ``None`` when no transition improved that cell.
        * ``best_score`` — ``dp[len(seq)]["END"]``.
        * ``topo`` — the topological list of node ids.
    """
    topo = _topological_node_ids(graph)
    nodes_by_id = {n.id: n for n in graph.nodes}

    # Predecessor lists, keyed by target node id.
    incoming: dict[str, list[str]] = {nid: [] for nid in topo}
    for edge in graph.edges:
        incoming[edge.target_id].append(edge.source_id)
    # Sort predecessors for determinism: with the strict ``>`` comparisons
    # below, the first predecessor reaching a given score keeps the cell.
    for nid in incoming:
        incoming[nid].sort()

    m = len(seq)
    # One row per count of consumed tokens (0..m); every cell starts unreachable.
    dp: dict[int, dict[str, float]] = {
        i: {nid: NEG_INF for nid in topo} for i in range(m + 1)
    }
    bp: dict[int, dict[str, tuple[str, int, str] | None]] = {
        i: {nid: None for nid in topo} for i in range(m + 1)
    }

    # Start: dp[0][START] = 0
    dp[0]["START"] = 0.0

    def _is_sentinel(nid: str) -> bool:
        return nid in ("START", "END")

    # Row i is complete before row i + 1 is read: match and insert only ever
    # write into row i + 1, and delete only reads cells of row i that come
    # earlier in topological order.
    for i in range(m + 1):
        for nid in topo:
            node = nodes_by_id[nid]

            # Delete (skip this node): advance from prev_nid to nid without
            # consuming a sequence token. Free at sentinels, otherwise
            # gap_penalty. Processed first so the insert step below, at the
            # same i, reads the updated dp[i][nid].
            # NOTE(review): the step is also free when prev_nid is START, so
            # skipping the node directly after START costs nothing — confirm
            # this is intended.
            for prev_nid in incoming[nid]:
                if dp[i][prev_nid] == NEG_INF:
                    continue
                step = 0.0 if (_is_sentinel(nid) or _is_sentinel(prev_nid)) else gap_penalty
                cand = dp[i][prev_nid] + step
                if cand > dp[i][nid]:
                    dp[i][nid] = cand
                    bp[i][nid] = ("delete", i, prev_nid)

            # Match: consume seq[i] AND advance from prev_nid to nid.
            # Not allowed at START (no node before it) or END (no real tokens).
            if i < m and nid not in ("START", "END"):
                token = seq[i]
                match_s = node_match_score(token, node, pack, mode=node_match_mode)
                # A -inf match score means the node can never be matched.
                if match_s != NEG_INF:
                    for prev_nid in incoming[nid]:
                        if dp[i][prev_nid] == NEG_INF:
                            continue
                        cand = dp[i][prev_nid] + match_s
                        if cand > dp[i + 1][nid]:
                            dp[i + 1][nid] = cand
                            bp[i + 1][nid] = ("match", i, prev_nid)

            # Insertion in seq: consume seq[i] but stay at this node.
            # Not allowed at START (we never sit on START while consuming).
            if i < m and nid != "START":
                if dp[i][nid] != NEG_INF:
                    cand = dp[i][nid] + gap_penalty
                    if cand > dp[i + 1][nid]:
                        dp[i + 1][nid] = cand
                        bp[i + 1][nid] = ("insert", i, nid)

    # The alignment score is read at END once all m tokens are consumed.
    best_score = dp[m]["END"]
    return {"dp": dp, "bp": bp, "best_score": best_score, "topo": topo}
def _traceback_ops(dp_result: dict) -> list[tuple[str, int, str, str]]:
    """Recover the alignment path from a forward-DP result.

    Starting at the ``"END"`` node in the highest row of ``dp_result["dp"]``,
    follows ``dp_result["bp"]`` until row ``0`` at ``"START"`` is reached, or
    until a cell without a backpointer is hit (the walk then stops early).

    Returns:
        The visited steps in forward order (START side first). Each step is
        ``(op_kind, prev_row, prev_node_id, curr_node_id)``, where
        ``op_kind`` is whatever the backpointer recorded.
    """
    back_pointers = dp_result["bp"]
    score_rows = dp_result["dp"]
    _ = dp_result["topo"]  # read only so a result without "topo" fails here

    row = max(score_rows)
    node_id = "END"
    trail: list[tuple[str, int, str, str]] = []
    while (row, node_id) != (0, "START"):
        pointer = back_pointers[row][node_id]
        if pointer is None:
            # Unreachable cell — nothing further to follow.
            break
        kind, prev_row, prev_node = pointer
        trail.append((kind, prev_row, prev_node, node_id))
        row, node_id = prev_row, prev_node

    trail.reverse()
    return trail
def align_sequence_to_graph(
    seq: list[Token],
    witness_id: str,
    graph: VariantGraph,
    pack: LanguagePack,
    pairwise_cfg: AlignerConfig,
    node_match_mode: str,
    gap_penalty: float,
) -> VariantGraph:
    """Thread a new witness through ``graph`` and return the merged graph.

    The alignment ops come from ``_run_poa_dp`` followed by
    ``_traceback_ops`` and are applied in order:

    * ``match`` — the matched node gains ``seq[prev_i]`` under ``witness_id``
      and the edge from the witness's previous node carries ``witness_id``.
    * ``insert`` — a new node holding only ``seq[prev_i]`` is appended and
      linked from the witness's previous node.
    * ``delete`` — nothing is recorded, so the witness's next edge bypasses
      the skipped node.

    A closing edge to ``"END"`` is added and the result is passed through
    ``_renumber_topologically``. The input ``graph`` lists are copied, not
    mutated.
    """
    ops = _traceback_ops(
        _run_poa_dp(seq, graph, pack, pairwise_cfg, node_match_mode, gap_penalty)
    )

    nodes = list(graph.nodes)
    edges = list(graph.edges)
    nodes_by_id = {n.id: n for n in nodes}
    # First list position of every node id, so a matched node can be swapped
    # in place without rescanning the list.
    position_of: dict[str, int] = {}
    for slot, existing in enumerate(nodes):
        position_of.setdefault(existing.id, slot)

    tail_id = "START"  # last node the new witness has actually visited
    inserted = 0

    for kind, seq_pos, _, node_id in ops:
        if kind == "delete":
            # Skipped node: the next edge will start from ``tail_id``.
            continue

        if kind == "insert":
            # Temporary id; replaced by the final renumbering.
            fresh_id = f"new:{inserted:06d}"
            inserted += 1
            fresh = GraphNode(id=fresh_id, tokens={witness_id: seq[seq_pos]})
            nodes.append(fresh)
            nodes_by_id[fresh_id] = fresh
            edges.append(
                GraphEdge(
                    source_id=tail_id,
                    target_id=fresh_id,
                    witnesses={witness_id},
                )
            )
            tail_id = fresh_id
            continue

        if kind != "match":
            continue

        matched = nodes_by_id[node_id]
        grown = GraphNode(
            id=matched.id,
            tokens={**matched.tokens, witness_id: seq[seq_pos]},
        )
        nodes[position_of[matched.id]] = grown
        nodes_by_id[matched.id] = grown
        edges = _add_witness_to_edge(edges, tail_id, node_id, witness_id)
        tail_id = node_id

    # Close the witness's path.
    edges = _add_witness_to_edge(edges, tail_id, "END", witness_id)

    merged = VariantGraph(
        nodes=nodes,
        edges=edges,
        witness_ids=sorted(set(graph.witness_ids) | {witness_id}),
    )
    return _renumber_topologically(merged)


def _add_witness_to_edge(
    edges: list[GraphEdge],
    source_id: str,
    target_id: str,
    witness_id: str,
) -> list[GraphEdge]:
    """Return a copy of ``edges`` in which ``source_id -> target_id`` carries ``witness_id``.

    Every existing edge with that source and target is replaced by a new
    ``GraphEdge`` whose witnesses include ``witness_id``; if there is none, a
    new edge carrying only ``witness_id`` is appended. Order is preserved.
    """

    def _is_wanted(edge: GraphEdge) -> bool:
        return edge.source_id == source_id and edge.target_id == target_id

    rebuilt = [
        GraphEdge(
            source_id=source_id,
            target_id=target_id,
            witnesses=edge.witnesses | {witness_id},
        )
        if _is_wanted(edge)
        else edge
        for edge in edges
    ]
    if not any(_is_wanted(edge) for edge in edges):
        rebuilt.append(
            GraphEdge(
                source_id=source_id,
                target_id=target_id,
                witnesses={witness_id},
            )
        )
    return rebuilt


def _renumber_topologically(graph: VariantGraph) -> VariantGraph:
    """Return ``graph`` with content nodes renamed ``n:NNNNNN`` in topological order.

    ``"START"`` and ``"END"`` keep their ids. Nodes are emitted in the order
    given by ``_topological_node_ids``; edges keep their original order with
    both endpoints translated; ``witness_ids`` is passed through.
    """
    sentinels = ("START", "END")
    topo = _topological_node_ids(graph)

    id_map: dict[str, str] = {nid: nid for nid in topo if nid in sentinels}
    content_ids = [nid for nid in topo if nid not in sentinels]
    id_map.update((nid, f"n:{rank:06d}") for rank, nid in enumerate(content_ids))

    originals = {n.id: n for n in graph.nodes}
    renamed_nodes = [
        GraphNode(id=id_map[old_id], tokens=originals[old_id].tokens)
        for old_id in topo
    ]
    renamed_edges = [
        GraphEdge(
            source_id=id_map[edge.source_id],
            target_id=id_map[edge.target_id],
            witnesses=edge.witnesses,
        )
        for edge in graph.edges
    ]
    return VariantGraph(
        nodes=renamed_nodes,
        edges=renamed_edges,
        witness_ids=graph.witness_ids,
    )
b/tests/multi/test_merge_apply.py @@ -0,0 +1,40 @@ +"""Tests for align_sequence_to_graph as a complete operation.""" + +import tracealign +from tracealign.align import AlignerConfig +from tracealign.multi.graph import VariantGraph +from tracealign.multi.merge import align_sequence_to_graph + + +def test_align_identical_sequence_merges_all_tokens(): + pack = tracealign.get_language("hbo") + a = tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1") + b = tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W2") + g = VariantGraph.from_sequence("W1", a) + g2 = align_sequence_to_graph(b, "W2", g, pack, AlignerConfig(), "max", -2.0) + + # Both witnesses present + assert set(g2.witness_ids) == {"W1", "W2"} + # All non-sentinel nodes carry both witnesses + for node in g2.nodes: + if node.id in ("START", "END"): + continue + assert set(node.tokens.keys()) == {"W1", "W2"} + # No new nodes introduced beyond the original (graph still has same content nodes) + assert len([n for n in g2.nodes if n.id not in ("START", "END")]) == len(a) + + +def test_align_with_insertion_creates_new_node(): + pack = tracealign.get_language("hbo") + base = tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1") + extended = tracealign.tokenize("שלום חדש עולם", lang="hbo", seq_label="W2") + g = VariantGraph.from_sequence("W1", base) + g2 = align_sequence_to_graph(extended, "W2", g, pack, AlignerConfig(), "max", -2.0) + + # Now there are 3 content nodes (the new "חדש" added) + content_nodes = [n for n in g2.nodes if n.id not in ("START", "END")] + assert len(content_nodes) == 3 + # The new node carries only W2 + inserted = [n for n in content_nodes if set(n.tokens.keys()) == {"W2"}] + assert len(inserted) == 1 + assert inserted[0].tokens["W2"].text == "חדש" From 3d05e7a183fe6011698e9686a2b9ed566f2252c2 Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 10:04:58 +0200 Subject: [PATCH 17/27] feat(multi): add progressive_merge wrapper Walks the guide tree in post-order and 
def progressive_merge(
    witnesses: dict[str, list[Token]],
    tree: GuideTree,
    pack: LanguagePack,
    pairwise_cfg: AlignerConfig,
    node_match_mode: str = "max",
    gap_penalty: float = -2.0,
) -> VariantGraph:
    """Fold every witness into a single variant graph.

    The merge order is ``post_order_witness_ids(tree)``. The first witness
    seeds the graph via ``VariantGraph.from_sequence``; each later witness is
    merged with ``align_sequence_to_graph``. An empty order yields an empty
    ``VariantGraph``.
    """
    merge_order = post_order_witness_ids(tree)
    if len(merge_order) == 0:
        return VariantGraph(nodes=[], edges=[], witness_ids=[])

    seed_id, *remaining = merge_order
    # The seed witness becomes the initial graph on its own.
    merged = VariantGraph.from_sequence(seed_id, witnesses[seed_id])

    for current_id in remaining:
        merged = align_sequence_to_graph(
            witnesses[current_id],
            current_id,
            merged,
            pack,
            pairwise_cfg,
            node_match_mode,
            gap_penalty,
        )

    return merged
tracealign.get_language("hbo") + seqs = { + "W1": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1"), + "W2": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W2"), + "W3": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W3"), + } + D = np.zeros((3, 3)) + tree = build_upgma(D, ["W1", "W2", "W3"]) + g = progressive_merge(seqs, tree, pack, AlignerConfig(), "max", -2.0) + # All content nodes should carry all three witnesses + for n in g.nodes: + if n.id in ("START", "END"): + continue + assert set(n.tokens.keys()) == {"W1", "W2", "W3"} + + +def test_progressive_merge_with_distinct_third_witness(): + pack = tracealign.get_language("hbo") + seqs = { + "W1": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1"), + "W2": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W2"), + "W3": tracealign.tokenize("שלום עולם חדש", lang="hbo", seq_label="W3"), + } + D = np.array([ + [0.0, 0.0, 0.3], + [0.0, 0.0, 0.3], + [0.3, 0.3, 0.0], + ]) + tree = build_upgma(D, ["W1", "W2", "W3"]) + g = progressive_merge(seqs, tree, pack, AlignerConfig(), "max", -2.0) + # All witnesses present + assert set(g.witness_ids) == {"W1", "W2", "W3"} + # There's at least one node holding only W3 (the inserted "חדש") + w3_only = [n for n in g.nodes + if n.id not in ("START", "END") and set(n.tokens.keys()) == {"W3"}] + assert len(w3_only) >= 1 From 8655126b69bd5809cf977600f51672ab83fadb22 Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 10:24:02 +0200 Subject: [PATCH 18/27] feat(multi): add MultiAlignerConfig dataclass Mirrors v0.1's AlignerConfig style. pairwise is nested for the Phase 1 distance calculations; node_match defaults to 'max' with 'mean' and 'min' available; guide_tree_method is 'upgma' in v0.2. _validate_config rejects unknown enum values at the API boundary. 
--- src/tracealign/multi/api.py | 38 ++++++++++++++++++++++ tests/multi/test_multi_aligner_config.py | 40 ++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 src/tracealign/multi/api.py create mode 100644 tests/multi/test_multi_aligner_config.py diff --git a/src/tracealign/multi/api.py b/src/tracealign/multi/api.py new file mode 100644 index 0000000..b13511f --- /dev/null +++ b/src/tracealign/multi/api.py @@ -0,0 +1,38 @@ +"""Public entry point for multi-witness alignment.""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +from tracealign.align import AlignerConfig + + +@dataclass +class MultiAlignerConfig: + """Configuration for align_multi. + + Follows the @dataclass style of v0.1's AlignerConfig; snapshotted into + MultiAlignmentResult.params for persistence. + """ + + pairwise: AlignerConfig = field(default_factory=AlignerConfig) + node_match: str = "max" + guide_tree_method: str = "upgma" + gap_penalty_multi: float = -2.0 + + +_VALID_NODE_MATCH = {"max", "mean", "min"} +_VALID_GUIDE_TREE = {"upgma"} + + +def _validate_config(cfg: MultiAlignerConfig) -> None: + if cfg.node_match not in _VALID_NODE_MATCH: + raise ValueError( + f"unknown node_match mode: {cfg.node_match!r}; " + f"expected one of {sorted(_VALID_NODE_MATCH)}" + ) + if cfg.guide_tree_method not in _VALID_GUIDE_TREE: + raise ValueError( + f"unknown guide_tree_method: {cfg.guide_tree_method!r}; " + f"expected one of {sorted(_VALID_GUIDE_TREE)}" + ) diff --git a/tests/multi/test_multi_aligner_config.py b/tests/multi/test_multi_aligner_config.py new file mode 100644 index 0000000..8d6aa6f --- /dev/null +++ b/tests/multi/test_multi_aligner_config.py @@ -0,0 +1,40 @@ +"""Tests for MultiAlignerConfig.""" + +import pytest + +from tracealign.align import AlignerConfig +from tracealign.multi.api import MultiAlignerConfig + + +def test_default_multi_config(): + cfg = MultiAlignerConfig() + assert isinstance(cfg.pairwise, AlignerConfig) + assert 
cfg.node_match == "max" + assert cfg.guide_tree_method == "upgma" + assert cfg.gap_penalty_multi == -2.0 + + +def test_override_node_match(): + cfg = MultiAlignerConfig(node_match="mean") + assert cfg.node_match == "mean" + + +def test_pairwise_config_is_distinct_per_instance(): + c1 = MultiAlignerConfig() + c2 = MultiAlignerConfig() + assert c1.pairwise is not c2.pairwise + + +def test_validate_known_node_match_values_only(): + # Direct field assignment to an unknown value is not validated at + # dataclass-construction time; validation happens at align_multi entry. + # But we provide a helper that does the check. + from tracealign.multi.api import _validate_config + + _validate_config(MultiAlignerConfig(node_match="max")) + _validate_config(MultiAlignerConfig(node_match="mean")) + _validate_config(MultiAlignerConfig(node_match="min")) + with pytest.raises(ValueError): + _validate_config(MultiAlignerConfig(node_match="median")) + with pytest.raises(ValueError): + _validate_config(MultiAlignerConfig(guide_tree_method="neighbor_joining")) From 6576bf3855c0ea5c13d7b130b018643f1ff48b36 Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 12:11:28 +0200 Subject: [PATCH 19/27] feat(multi): add MultiAlignmentResult and align_multi() End-to-end pipeline: pairwise_distances -> build_upgma -> progressive_merge, then derive an AlignedTable from the graph. params snapshot carries trace_version, language_pack_version, and the full config. summary is empty in v0.2.0 and can be enriched in v0.2.x without an API break. 
--- src/tracealign/__init__.py | 2 + src/tracealign/multi/api.py | 93 +++++++++++++++++++++++++++++++++ tests/multi/test_align_multi.py | 49 +++++++++++++++++ 3 files changed, 144 insertions(+) create mode 100644 tests/multi/test_align_multi.py diff --git a/src/tracealign/__init__.py b/src/tracealign/__init__.py index 416bd8c..bd332cd 100644 --- a/src/tracealign/__init__.py +++ b/src/tracealign/__init__.py @@ -16,6 +16,7 @@ register_language, ) from tracealign.model import AlignmentResult, Lexica, Match, Reason, Token +from tracealign.multi.api import align_multi from tracealign.tokenize.base import ( DEFAULT_EDITORIAL_RULES, EditorialBracketRules, @@ -129,6 +130,7 @@ def align( "UnknownLanguageError", "__version__", "align", + "align_multi", "get_language", "list_languages", "register_language", diff --git a/src/tracealign/multi/api.py b/src/tracealign/multi/api.py index b13511f..c18eb1f 100644 --- a/src/tracealign/multi/api.py +++ b/src/tracealign/multi/api.py @@ -3,8 +3,19 @@ from __future__ import annotations from dataclasses import dataclass, field +from typing import Any +from pydantic import BaseModel, ConfigDict + +import tracealign as _tracealign_pkg from tracealign.align import AlignerConfig +from tracealign.lang.base import LanguagePack +from tracealign.model import Reason, Token +from tracealign.multi.distance import pairwise_distances +from tracealign.multi.graph import VariantGraph +from tracealign.multi.guide_tree import GuideTree, build_upgma +from tracealign.multi.merge import progressive_merge +from tracealign.multi.table import AlignedTable, TableCell, TableColumn @dataclass @@ -36,3 +47,85 @@ def _validate_config(cfg: MultiAlignerConfig) -> None: f"unknown guide_tree_method: {cfg.guide_tree_method!r}; " f"expected one of {sorted(_VALID_GUIDE_TREE)}" ) + + +class MultiAlignmentResult(BaseModel): + """Top-level result of align_multi().""" + + model_config = ConfigDict(extra="forbid") + + graph: VariantGraph + table: AlignedTable + guide_tree: 
GuideTree + witness_ids: list[str] + summary: dict[Reason, int] + params: dict[str, Any] + + +def align_multi( + witnesses: dict[str, list[Token]], + lang: str | LanguagePack = "hbo", + config: MultiAlignerConfig | None = None, +) -> MultiAlignmentResult: + """Align N witnesses simultaneously, producing a variant graph + aligned table.""" + cfg = config or MultiAlignerConfig() + _validate_config(cfg) + + pack = _tracealign_pkg.get_language(lang) + + # Phase 1: distance matrix + D, witness_ids = pairwise_distances(witnesses, pack, cfg.pairwise) + + # Phase 2: guide tree + if cfg.guide_tree_method == "upgma": + tree = build_upgma(D, witness_ids) + else: + raise ValueError(f"unsupported guide_tree_method: {cfg.guide_tree_method}") + + # Phase 3: progressive merge + graph = progressive_merge( + witnesses, tree, pack, cfg.pairwise, cfg.node_match, cfg.gap_penalty_multi + ) + + # Derive table view (default-anchored to first witness in tree post-order) + table = _build_table_from_graph(graph) + + # Aggregate per-pair summaries into a single counter. + # In v0.2 we keep this simple: empty summary unless we want to expose it + # later. (Acceptable per spec — summary may be empty in v0.2.0; richer + # aggregation can come in v0.2.x without an API break.) 
+ summary: dict[Reason, int] = {} + + params: dict[str, Any] = { + "lang": pack.code, + "gap_open": cfg.pairwise.gap_open, + "gap_extend": cfg.pairwise.gap_extend, + "gap_penalty_multi": cfg.gap_penalty_multi, + "node_match": cfg.node_match, + "guide_tree_method": cfg.guide_tree_method, + "trace_version": getattr(_tracealign_pkg, "__version__", "0.0.0"), + "language_pack_version": getattr(pack, "version", "unknown"), + } + + return MultiAlignmentResult( + graph=graph, + table=table, + guide_tree=tree, + witness_ids=witness_ids, + summary=summary, + params=params, + ) + + +def _build_table_from_graph(graph: VariantGraph) -> AlignedTable: + """Derive an AlignedTable by walking the topological order of the graph.""" + columns: list[TableColumn] = [] + for node in graph.nodes: + if node.id in ("START", "END"): + continue + cells: dict[str, TableCell] = {} + for wid in graph.witness_ids: + tok = node.tokens.get(wid) + cells[wid] = TableCell(token=tok, node_id=node.id if tok else None) + columns.append(TableColumn(cells=cells)) + return AlignedTable(witnesses=list(graph.witness_ids), columns=columns) diff --git a/tests/multi/test_align_multi.py b/tests/multi/test_align_multi.py new file mode 100644 index 0000000..a231c50 --- /dev/null +++ b/tests/multi/test_align_multi.py @@ -0,0 +1,49 @@ +"""End-to-end tests for align_multi.""" + +import pytest + +import tracealign +from tracealign.multi.api import MultiAlignmentResult, align_multi + + +def test_align_multi_single_witness(): + seqs = {"W1": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1")} + result = align_multi(seqs, lang="hbo") + assert isinstance(result, MultiAlignmentResult) + assert result.witness_ids == ["W1"] + assert len(result.graph.nodes) >= 2 # at least START and END + + +def test_align_multi_two_identical_witnesses(): + seqs = { + "W1": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1"), + "W2": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W2"), + } + result = 
align_multi(seqs, lang="hbo") + assert set(result.witness_ids) == {"W1", "W2"} + # Every content node carries both witnesses + for n in result.graph.nodes: + if n.id in ("START", "END"): + continue + assert set(n.tokens.keys()) == {"W1", "W2"} + + +def test_align_multi_params_carry_version_metadata(): + seqs = {"W1": tracealign.tokenize("שלום", lang="hbo", seq_label="W1")} + result = align_multi(seqs, lang="hbo") + assert "trace_version" in result.params + assert "language_pack_version" in result.params + assert "guide_tree_method" in result.params + assert result.params["lang"] == "hbo" + + +def test_align_multi_rejects_unknown_node_match(): + from tracealign.multi.api import MultiAlignerConfig + + seqs = { + "W1": tracealign.tokenize("שלום", lang="hbo", seq_label="W1"), + "W2": tracealign.tokenize("שלום", lang="hbo", seq_label="W2"), + } + bad = MultiAlignerConfig(node_match="median") + with pytest.raises(ValueError): + align_multi(seqs, lang="hbo", config=bad) From a67e426a64685d66c4a984820b4a3b541588c1e3 Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 16:00:00 +0200 Subject: [PATCH 20/27] feat(api): re-export multi-witness types at top level align_multi, MultiAlignerConfig, MultiAlignmentResult, VariantGraph, GraphNode, GraphEdge, AlignedTable, TableColumn, TableCell, GuideTree, GuideTreeNode are now reachable as tracealign.<name>. 
--- src/tracealign/__init__.py | 19 ++++++++++++++++++- tests/multi/test_public_reexports.py | 27 +++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 tests/multi/test_public_reexports.py diff --git a/src/tracealign/__init__.py b/src/tracealign/__init__.py index bd332cd..479cd60 100644 --- a/src/tracealign/__init__.py +++ b/src/tracealign/__init__.py @@ -16,7 +16,14 @@ register_language, ) from tracealign.model import AlignmentResult, Lexica, Match, Reason, Token -from tracealign.multi.api import align_multi +from tracealign.multi.api import ( + MultiAlignerConfig, + MultiAlignmentResult, + align_multi, +) +from tracealign.multi.graph import GraphEdge, GraphNode, VariantGraph +from tracealign.multi.guide_tree import GuideTree, GuideTreeNode +from tracealign.multi.table import AlignedTable, TableCell, TableColumn from tracealign.tokenize.base import ( DEFAULT_EDITORIAL_RULES, EditorialBracketRules, @@ -117,17 +124,27 @@ def align( __all__ = [ + "AlignedTable", "AlignerConfig", "AlignmentResult", "DEFAULT_EDITORIAL_RULES", "EditorialBracketRules", + "GraphEdge", + "GraphNode", + "GuideTree", + "GuideTreeNode", "LanguagePack", "Lexica", "Match", + "MultiAlignerConfig", + "MultiAlignmentResult", "RawToken", "Reason", + "TableCell", + "TableColumn", "Token", "UnknownLanguageError", + "VariantGraph", "__version__", "align", "align_multi", diff --git a/tests/multi/test_public_reexports.py b/tests/multi/test_public_reexports.py new file mode 100644 index 0000000..c551068 --- /dev/null +++ b/tests/multi/test_public_reexports.py @@ -0,0 +1,27 @@ +"""Tests that align_multi and the multi types are reachable at the top level.""" + + +def test_top_level_align_multi(): + import tracealign + + assert hasattr(tracealign, "align_multi") + assert hasattr(tracealign, "MultiAlignerConfig") + assert hasattr(tracealign, "MultiAlignmentResult") + + +def test_top_level_graph_types(): + import tracealign + + assert hasattr(tracealign, 
"VariantGraph") + assert hasattr(tracealign, "GraphNode") + assert hasattr(tracealign, "GraphEdge") + + +def test_top_level_table_and_guide_tree(): + import tracealign + + assert hasattr(tracealign, "AlignedTable") + assert hasattr(tracealign, "TableColumn") + assert hasattr(tracealign, "TableCell") + assert hasattr(tracealign, "GuideTree") + assert hasattr(tracealign, "GuideTreeNode") From 5d00806bc2763bc95f7f3b628b5e66a8929521e2 Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 17:26:41 +0200 Subject: [PATCH 21/27] feat(io): add MultiAlignmentResult JSON dump/load Dedicated module separate from io/result.py for v0.1's AlignmentResult. dumps/loads for string round-trips, dump/load for file round-trips. Round-trip preserves the guide tree's distance matrix end-to-end. --- src/tracealign/io/multi_result.py | 23 +++++++++++++++ tests/io/test_multi_result_json.py | 47 ++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 src/tracealign/io/multi_result.py create mode 100644 tests/io/test_multi_result_json.py diff --git a/src/tracealign/io/multi_result.py b/src/tracealign/io/multi_result.py new file mode 100644 index 0000000..a7304b3 --- /dev/null +++ b/src/tracealign/io/multi_result.py @@ -0,0 +1,23 @@ +"""JSON persistence for MultiAlignmentResult.""" + +from __future__ import annotations + +from pathlib import Path + +from tracealign.multi.api import MultiAlignmentResult + + +def dumps(result: MultiAlignmentResult) -> str: + return result.model_dump_json() + + +def loads(payload: str) -> MultiAlignmentResult: + return MultiAlignmentResult.model_validate_json(payload) + + +def dump(result: MultiAlignmentResult, path: Path | str) -> None: + Path(path).write_text(dumps(result), encoding="utf-8") + + +def load(path: Path | str) -> MultiAlignmentResult: + return loads(Path(path).read_text(encoding="utf-8")) diff --git a/tests/io/test_multi_result_json.py b/tests/io/test_multi_result_json.py new file mode 100644 index 
0000000..f977cf2 --- /dev/null +++ b/tests/io/test_multi_result_json.py @@ -0,0 +1,47 @@ +"""Tests for MultiAlignmentResult JSON persistence.""" + +import json +from pathlib import Path + +import tracealign +from tracealign.io import multi_result as mr_io + + +def _build_result(): + seqs = { + "W1": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1"), + "W2": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W2"), + } + return tracealign.align_multi(seqs, lang="hbo") + + +def test_dumps_loads_round_trip(): + r = _build_result() + s = mr_io.dumps(r) + restored = mr_io.loads(s) + assert set(restored.witness_ids) == set(r.witness_ids) + assert restored.params["lang"] == "hbo" + + +def test_dumps_is_valid_json(): + r = _build_result() + s = mr_io.dumps(r) + parsed = json.loads(s) + assert "graph" in parsed + assert "table" in parsed + assert "guide_tree" in parsed + + +def test_dump_load_file_round_trip(tmp_path: Path): + r = _build_result() + f = tmp_path / "multi.json" + mr_io.dump(r, f) + restored = mr_io.load(f) + assert restored.witness_ids == r.witness_ids + + +def test_round_trip_preserves_distance_matrix(): + r = _build_result() + s = mr_io.dumps(r) + restored = mr_io.loads(s) + assert restored.guide_tree.distance_matrix == r.guide_tree.distance_matrix From a8b8771c1016278718c38a9d86eed4c08964fea6 Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 17:33:14 +0200 Subject: [PATCH 22/27] test(multi): pin lossless reconstruction property For every witness w in the input, the path through the result graph must yield exactly the original token sequence. This is the v0.2 correctness guarantee against information loss during merging. 
--- tests/multi/test_lossless_reconstruction.py | 40 +++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tests/multi/test_lossless_reconstruction.py diff --git a/tests/multi/test_lossless_reconstruction.py b/tests/multi/test_lossless_reconstruction.py new file mode 100644 index 0000000..66df679 --- /dev/null +++ b/tests/multi/test_lossless_reconstruction.py @@ -0,0 +1,40 @@ +"""Property test: every input witness can be exactly reconstructed from the result graph.""" + +import tracealign + + +def _reconstruct(result, wid: str): + """Walk the witness's path through the graph and return its tokens.""" + return [n.tokens[wid] for n in result.graph.witness_path(wid)] + + +def test_lossless_reconstruction_two_witnesses(): + seqs = { + "W1": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1"), + "W2": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W2"), + } + result = tracealign.align_multi(seqs, lang="hbo") + for wid in seqs: + assert _reconstruct(result, wid) == seqs[wid] + + +def test_lossless_reconstruction_three_witnesses_with_insertion(): + seqs = { + "W1": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W1"), + "W2": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="W2"), + "W3": tracealign.tokenize("שלום חדש עולם", lang="hbo", seq_label="W3"), + } + result = tracealign.align_multi(seqs, lang="hbo") + for wid in seqs: + assert _reconstruct(result, wid) == seqs[wid] + + +def test_lossless_reconstruction_diverse_witnesses(): + seqs = { + "A": tracealign.tokenize("שלום עולם רַבִּי דויד אמר", lang="hbo", seq_label="A"), + "B": tracealign.tokenize("שלום עולם רבי דוד אמר", lang="hbo", seq_label="B"), + "C": tracealign.tokenize("שלום עולם ר\"י אמר", lang="hbo", seq_label="C"), + } + result = tracealign.align_multi(seqs, lang="hbo") + for wid in seqs: + assert _reconstruct(result, wid) == seqs[wid] From 275276bd7c7f9fe56d8e53a59efcb19623978bdc Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 17:45:24 +0200 
Subject: [PATCH 23/27] test(multi): pin permutation invariance property The same set of witnesses in different input dict insertion order must yield the same alignment (same witness paths, same variant loci). This justifies the guide tree as the algorithmic foundation of v0.2. --- tests/multi/test_permutation_invariance.py | 52 ++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 tests/multi/test_permutation_invariance.py diff --git a/tests/multi/test_permutation_invariance.py b/tests/multi/test_permutation_invariance.py new file mode 100644 index 0000000..e7b4c35 --- /dev/null +++ b/tests/multi/test_permutation_invariance.py @@ -0,0 +1,52 @@ +"""Property test: align_multi result must not depend on input dict insertion order.""" + +import tracealign + + +def _witness_paths(result): + return { + wid: [n.tokens[wid].text for n in result.graph.witness_path(wid)] + for wid in result.witness_ids + } + + +def _variant_loci(result): + return frozenset( + frozenset((wid, t.text) for wid, t in n.tokens.items()) + for n in result.graph.variants() + ) + + +def test_permutation_invariance_two_orderings(): + seqs_a = { + "W1": tracealign.tokenize("שלום עולם רַבִּי דויד", lang="hbo", seq_label="W1"), + "W2": tracealign.tokenize("שלום עולם רבי דוד", lang="hbo", seq_label="W2"), + "W3": tracealign.tokenize("שלום עולם ר\"י", lang="hbo", seq_label="W3"), + } + # Different insertion order, same data + seqs_b = { + "W3": seqs_a["W3"], + "W1": seqs_a["W1"], + "W2": seqs_a["W2"], + } + + r_a = tracealign.align_multi(seqs_a, lang="hbo") + r_b = tracealign.align_multi(seqs_b, lang="hbo") + + assert _witness_paths(r_a) == _witness_paths(r_b) + assert _variant_loci(r_a) == _variant_loci(r_b) + + +def test_permutation_invariance_with_insertion(): + seqs_a = { + "A": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="A"), + "B": tracealign.tokenize("שלום עולם", lang="hbo", seq_label="B"), + "C": tracealign.tokenize("שלום חדש עולם", lang="hbo", seq_label="C"), + } + 
seqs_b = {"C": seqs_a["C"], "A": seqs_a["A"], "B": seqs_a["B"]} + + r_a = tracealign.align_multi(seqs_a, lang="hbo") + r_b = tracealign.align_multi(seqs_b, lang="hbo") + + assert _witness_paths(r_a) == _witness_paths(r_b) + assert _variant_loci(r_a) == _variant_loci(r_b) From 773ab3881fc9507724938720ed220abf153df002 Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 18:48:10 +0200 Subject: [PATCH 24/27] test(e2e): add synthetic Hebrew multi-witness golden test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four constructed witnesses exercising agreement (שלום עולם), niqqud stripping (רַבִּי vs רבי), plene/defective (דויד vs דוד), abbreviation (ר"י vs רבי), and insertion (טוב in W4 only). Pins the lossless reconstruction property and that the W4 insertion surfaces as a column in the re-anchored AlignedTable. --- tests/e2e/test_synthetic_hebrew_multi.py | 62 ++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 tests/e2e/test_synthetic_hebrew_multi.py diff --git a/tests/e2e/test_synthetic_hebrew_multi.py b/tests/e2e/test_synthetic_hebrew_multi.py new file mode 100644 index 0000000..5fee528 --- /dev/null +++ b/tests/e2e/test_synthetic_hebrew_multi.py @@ -0,0 +1,62 @@ +"""End-to-end multi-witness golden test on synthetic Hebrew.""" + +import tracealign + + +WITNESS_TEXTS = { + "W1": "שלום עולם רַבִּי דויד אמר מחר", + "W2": "שלום עולם רבי דוד אמר מחר", + "W3": "שלום עולם ר\"י אמר מחר", + "W4": "שלום עולם רבי דוד אמר טוב מחר", +} + + +def _align_witnesses(): + return tracealign.align_multi( + { + wid: tracealign.tokenize(text, lang="hbo", seq_label=wid) + for wid, text in WITNESS_TEXTS.items() + }, + lang="hbo", + ) + + +def test_synthetic_multi_all_witnesses_present(): + result = _align_witnesses() + assert set(result.witness_ids) == {"W1", "W2", "W3", "W4"} + + +def test_synthetic_multi_lossless_reconstruction(): + result = _align_witnesses() + expected = { + wid: [t.text for t in 
tracealign.tokenize(text, lang="hbo", seq_label=wid)] + for wid, text in WITNESS_TEXTS.items() + } + for wid, expected_texts in expected.items(): + path_texts = [n.tokens[wid].text for n in result.graph.witness_path(wid)] + assert path_texts == expected_texts + + +def test_synthetic_multi_has_at_least_one_variant_locus(): + result = _align_witnesses() + variants = list(result.graph.variants()) + assert len(variants) >= 1 + + +def test_synthetic_multi_w4_insertion_appears_in_table(): + result = _align_witnesses() + re = result.table.re_anchor("W4") + # Some column shows token "טוב" for W4 and a gap for the others + found_tov = False + for col in re.columns: + cell_w4 = col.cells.get("W4") + if cell_w4 and cell_w4.token and cell_w4.token.text == "טוב": + others_gap = all( + col.cells[wid].token is None + for wid in ("W1", "W2", "W3") + if wid in col.cells + ) + if others_gap: + found_tov = True + break + assert found_tov From a58d75f76610ca787ce87dd2d01fbbe461549370 Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 19:02:11 +0200 Subject: [PATCH 25/27] docs(usage): add multi-witness alignment section Covers the align_multi entry point, MultiAlignmentResult fields, MultiAlignerConfig, and JSON persistence via io/multi_result. --- docs/usage.md | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 26e98a5..0077413 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -61,6 +61,66 @@ for m in result.matches: `m.details` carries Reason-specific extra information. For ABBREVIATION matches that's `role: "primary"` or `"continuation"`, the `expansion` string (e.g. `"רבי ישמעאל"`), and `span_size`. For ORTHOGRAPHIC matches it's the rapidfuzz ratio. +## Multi-witness alignment + +`tracealign.align_multi(witnesses, lang, config=None)` aligns N witness sequences simultaneously and returns a canonical variant graph plus a derived aligned table. 
+ +```python +import tracealign + +witnesses = { + "W1": tracealign.tokenize("שלום עולם רבי דוד אמר", lang="hbo", seq_label="W1"), + "W2": tracealign.tokenize("שלום עולם רבי דוד אמר", lang="hbo", seq_label="W2"), + "W3": tracealign.tokenize("שלום עולם ר\"י אמר", lang="hbo", seq_label="W3"), +} + +result = tracealign.align_multi(witnesses, lang="hbo") + +print(result.guide_tree.format_text()) +print(result.table.format_text()) + +for node in result.graph.variants(): + readings = {wid: t.text for wid, t in node.tokens.items()} + print(node.id, readings) +``` + +The result exposes: + +| Attribute | Description | +|---|---| +| `result.graph` | The canonical `VariantGraph` (DAG). Use `witness_path(w)` to get one witness's trail; `variants()` to iterate variant loci. | +| `result.table` | The derived `AlignedTable`. Use `re_anchor(witness_id)` to render with any witness as the reference column. | +| `result.guide_tree` | The UPGMA `GuideTree`. Carries the original distance matrix for downstream use. | +| `result.witness_ids` | List of witness ids, sorted lexicographically. | +| `result.summary` | Aggregated Reason counts (may be empty in 0.2.0; richer aggregation in later patches). | +| `result.params` | Configuration snapshot plus `trace_version` and `language_pack_version`. | + +### Configuration + +```python +from tracealign import MultiAlignerConfig +from tracealign.align import AlignerConfig + +cfg = MultiAlignerConfig( + pairwise=AlignerConfig(gap_open=-2.5), + node_match="max", # also "mean" or "min" + guide_tree_method="upgma", + gap_penalty_multi=-2.0, +) +result = tracealign.align_multi(witnesses, lang="hbo", config=cfg) +``` + +### Persistence + +```python +from tracealign.io import multi_result as mr_io + +mr_io.dump(result, "alignment.json") +restored = mr_io.load("alignment.json") +``` + +JSON round-trip preserves the entire result including the guide tree's distance matrix. 
+ ## I/O ### JSON round-trip From 4ab6b0233ee79329e61bc4ac7c321d75a5309a18 Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 19:05:47 +0200 Subject: [PATCH 26/27] docs: add multi-witness algorithm details and update roadmap Adds a v0.2 algorithm details section to details.md describing the three-phase pipeline (pairwise distances, UPGMA guide tree, POA-based progressive merge) and the two correctness properties. Marks stage 2 in ROADMAP.md as in-progress on feature/v0.2-multi-witness. --- docs/ROADMAP.md | 2 +- docs/details.md | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index e88ac5a..f118d70 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -34,7 +34,7 @@ The full ambition spans ten stages, each its own brainstorm → spec → plan | # | Stage | Capability it unlocks | Status | |---|---|---|---| | 1 | **Pairwise aligner + Hebrew pack** | TRACE v0.1 — paarweise Alignment-Kernel | ✅ released 0.1.3 | -| 2 | **Master alignment graph** | Simultaneous multi-witness alignment (Sifra full witness set, Tanhuma) | planned (v0.2) | +| 2 | **Master alignment graph** | Simultaneous multi-witness alignment (Sifra full witness set, Tanhuma) | in progress (v0.2 feature/v0.2-multi-witness) | | 3 | **Geniza fragment anchor detection** | Matching small fragments against a large candidate pool (hundreds of Sifra Genizah fragments) | planned | | 4 | **Text-reuse detection** | Finding recurring phrases and verbatim citations across a corpus (biblical citations in rabbinic literature, recurring rabbinic formulae) | planned | | 5 | **Apparatus / critical-edition generation** | Producing publication-grade critical editions (lemmas, sigla, Fließtext) directly from alignment output | planned | diff --git a/docs/details.md b/docs/details.md index 242e528..dafa187 100644 --- a/docs/details.md +++ b/docs/details.md @@ -198,3 +198,55 @@ src/tracealign/ escriptorium.py # 
eScriptorium JSON importer tei.py # TEI XML importer ``` + +## Multi-witness alignment (v0.2) + +`align_multi` extends the pairwise aligner to N witnesses. The pipeline is three-phase: + +### Phase 1 — Pairwise distances + +Every pair of witnesses is aligned with `tracealign.align()` (the v0.1 pairwise aligner) and the distance is computed as `1 − total_score`. The result is a symmetric `N × N` distance matrix; the diagonal is zero. Witness ids are sorted lexicographically before computing, making the matrix independent of dict insertion order. + +### Phase 2 — UPGMA guide tree + +A binary guide tree is built from the distance matrix using **UPGMA** (Unweighted Pair Group Method with Arithmetic Mean). At every iteration the closest cluster pair is merged. Ties are broken on the canonical `(min, max)` lexicographic order of cluster members, guaranteeing determinism. The tree's `height` field carries the cumulative UPGMA distance — a starting point for later stemmatic work. + +### Phase 3 — Progressive POA-based merge + +The guide tree is walked in post-order to produce a canonical merge sequence (closely-related witnesses are merged first). The first witness seeds the graph as a linear chain. Each subsequent witness is aligned to the current graph via **partial-order alignment (POA)** — a DP over the topologically sorted graph nodes. Three transitions: + +| Transition | Effect on graph | +|---|---| +| Match | Merge the new token into an existing node's `tokens[witness_id]`; extend the witness set on the incoming edge. | +| Insertion in sequence (gap in graph) | Add a new node holding only this witness's token; new edge `prev → new`. | +| Deletion (skip graph node) | The new witness's path bypasses this node — recorded by an edge that skips it. | + +`node_match_score` aggregates the per-constituent tiered score across the witnesses already in the target node. The default mode `"max"` is permissive (CollateX-aligned); `"mean"` and `"min"` are configurable. 
+ +### Correctness guarantees + +Two properties are pinned by tests: + +- **Lossless reconstruction.** For every input witness `w`, the path through the result graph yields exactly the original token sequence. +- **Permutation invariance.** The same set of witnesses in any input dict order produces the same alignment (same witness paths, same variant loci). + +### Data flow + +``` +align_multi(witnesses, lang, config) + │ + ▼ +pairwise_distances — Phase 1: N(N−1)/2 pairwise alignments + │ + ▼ +build_upgma — Phase 2: deterministic binary tree + │ + ▼ +progressive_merge — Phase 3: post-order POA-based merge + │ + ▼ +VariantGraph + │ + ├──► AlignedTable — derived view, re-anchorable + └──► MultiAlignmentResult (graph + table + guide_tree + summary + params) +``` From 7bf848d080f88f2bad856569f209a1f4ec3aa1ce Mon Sep 17 00:00:00 2001 From: bsesic Date: Thu, 28 May 2026 19:19:05 +0200 Subject: [PATCH 27/27] docs: revise README and docs for v0.2 multi-witness features - README: extends the tagline and Highlights to cover both pairwise and multi-witness alignment; adds a from-source install section and a verify step; adds a Quick-start section for align_multi alongside the existing pairwise example; updates the documentation table to mention multi-witness usage and the POA algorithm; refreshes the Project status block with the current PyPI release, v0.2 spec link, released-vs-in-progress stages, and the full long-term sub-project list; rewrites the Citation block to reference the Zenodo concept DOI (now live). - docs/index.md: same tagline / At-a-glance refresh plus a project status block update that lists the full ten-stage roadmap. - docs/faq.md: new entries explaining how multi-witness alignment differs from pairwise, the determinism guarantees, the scale target, the UPGMA-vs-Neighbor-Joining choice, the incremental-add API status, and the persistence module. Sphinx -W build is clean; the test suite stays at 182 passing. 
--- README.md | 93 ++++++++++++++++++++++++++++++++++++++++----------- docs/faq.md | 41 ++++++++++++++++++++++- docs/index.md | 11 +++--- 3 files changed, 120 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 42590f1..6e472ce 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # TRACE -**Textual Reuse, Alignment, and Collation Engine** — a Python library for pairwise philological alignment with pluggable language packs. +**Textual Reuse, Alignment, and Collation Engine** — a Python library for philological alignment with pluggable language packs. Pairwise (v0.1) and simultaneous multi-witness (v0.2) alignment. [![CI](https://github.com/bsesic/trace/actions/workflows/workflow.yml/badge.svg)](https://github.com/bsesic/trace/actions/workflows/workflow.yml) [![PyPI version](https://img.shields.io/pypi/v/tracealign.svg)](https://pypi.org/project/tracealign/) @@ -17,10 +17,11 @@ TRACE is designed for textual criticism, manuscript witness comparison, and the - **Tokenizer pipeline** with editorial-marker awareness (`[reconstructed]`, `⟦deletion⟧`, `〈insertion〉`, `(expanded)`, lacunae). - **Tiered scoring** returning `(score, reason)` per token pair — `EXACT`, `NIQQUD_STRIPPED`, `PLENE_DEFECTIVE`, `ABBREVIATION`, `ORTHOGRAPHIC`, `INSERTION`, `OMISSION`, `NO_MATCH`. -- **Semi-global Needleman–Wunsch** with affine gap penalties (Gotoh) and a **multi-token abbreviation lookahead** (`ר"י` ↔ `רבי ישמעאל`). +- **Pairwise aligner** — semi-global Needleman–Wunsch with affine gap penalties (Gotoh) and a multi-token abbreviation lookahead (`ר"י` ↔ `רבי ישמעאל`). +- **Multi-witness aligner** (v0.2) — N witnesses aligned simultaneously into a canonical variant graph (DAG) plus a derived aligned table view, via pairwise distances → UPGMA guide tree → POA-based progressive merge. Determinism is pinned by a permutation-invariance property test; correctness by a lossless-reconstruction property test. 
- **Hebrew language pack** with niqqud strip, plene/defective skeleton matching, gershayim/maqqef tokenizer hooks, and a seed lexicon of rabbinic abbreviations (extendable via `Lexica.merge()`). -- **I/O** for plain text, JSON (round-trip), eScriptorium exports (with bbox + line metadata), and TEI XML (`` mode + flow-text fallback). -- **Reproducible** — every `AlignmentResult` carries `trace_version` and `language_pack_version` in its params. +- **I/O** for plain text, JSON (round-trip for both pairwise and multi-witness results), eScriptorium exports (with bbox + line metadata), and TEI XML (`` mode + flow-text fallback). +- **Reproducible** — every `AlignmentResult` / `MultiAlignmentResult` carries `trace_version` and `language_pack_version` in its params. ## Installation @@ -28,9 +29,27 @@ TRACE is designed for textual criticism, manuscript witness comparison, and the pip install tracealign ``` -Requires Python 3.10+. Pulls `pydantic`, `numpy`, `lxml`, and `rapidfuzz`. +Requires Python 3.10, 3.11, or 3.12. Pulls `pydantic`, `numpy`, `lxml`, and `rapidfuzz`. -## Quick start +### From source + +```bash +git clone https://github.com/bsesic/trace.git +cd trace +pip install -e ".[dev]" +``` + +The `dev` extra adds `pytest` and `flake8` (the project's quality gates). For documentation contributions, use `pip install -e ".[docs]"` to add Sphinx, furo, and myst-parser. + +### Verifying the install + +```bash +python -c "import tracealign; print(tracealign.__version__, tracealign.list_languages())" +``` + +Should print the current version and `['hbo']` (the Hebrew language pack registers itself on import). + +## Quick start — pairwise ```python import tracealign @@ -62,31 +81,67 @@ summary: {EXACT: 3, NIQQUD_STRIPPED: 1, PLENE_DEFECTIVE: 1, ABBREVIATION: 1} אמר ↔ אמר exact 1.00 ``` -See **[the documentation](https://tracealign.readthedocs.io/en/latest/)** for installation details, the full API, FAQs, and the design rationale. 
+## Quick start — multi-witness (v0.2) + +```python +import tracealign + +witnesses = { + "W1": tracealign.tokenize("שלום עולם רַבִּי דויד אמר", lang="hbo", seq_label="W1"), + "W2": tracealign.tokenize("שלום עולם רבי דוד אמר", lang="hbo", seq_label="W2"), + "W3": tracealign.tokenize("שלום עולם ר\"י אמר", lang="hbo", seq_label="W3"), + "W4": tracealign.tokenize("שלום עולם רבי דוד אמר טוב", lang="hbo", seq_label="W4"), +} + +result = tracealign.align_multi(witnesses, lang="hbo") + +print(result.guide_tree.format_text()) +print(result.table.format_text()) + +for node in result.graph.variants(): + readings = {wid: t.text for wid, t in node.tokens.items()} + print(node.id, readings) +``` + +The `MultiAlignmentResult` exposes a canonical `VariantGraph` (DAG with witness trails), a derived `AlignedTable` (re-anchorable to any witness for presentation), a `GuideTree` (UPGMA-built, carrying the original distance matrix — useful for downstream stemmatic work), and the same reproducibility-aware `params` snapshot the pairwise aligner produces. + +JSON persistence works the same way as the pairwise aligner, in its own module: + +```python +from tracealign.io import multi_result as mr_io + +mr_io.dump(result, "alignment.json") +restored = mr_io.load("alignment.json") +``` + +See **[the documentation](https://tracealign.readthedocs.io/en/latest/)** for the full API, more usage examples, the algorithm details, FAQs, and the design rationale. 
## Documentation | Section | What it covers | |---|---| -| [Installation](https://tracealign.readthedocs.io/en/latest/installation.html) | pip / from source / dev setup | -| [Usage](https://tracealign.readthedocs.io/en/latest/usage.html) | Tokenize, align, work with the result, custom lexica | -| [Details](https://tracealign.readthedocs.io/en/latest/details.html) | Tokenizer pipeline, scoring tiers, DP algorithm | -| [FAQ](https://tracealign.readthedocs.io/en/latest/faq.html) | Common questions about scope, language packs, performance | +| [Installation](https://tracealign.readthedocs.io/en/latest/installation.html) | pip / from source / dev setup / docs build | +| [Usage](https://tracealign.readthedocs.io/en/latest/usage.html) | Tokenize, pairwise align, multi-witness align, work with the result, custom lexica, I/O | +| [Details](https://tracealign.readthedocs.io/en/latest/details.html) | Tokenizer pipeline, scoring tiers, pairwise DP algorithm, multi-witness POA pipeline | +| [FAQ](https://tracealign.readthedocs.io/en/latest/faq.html) | Common questions about scope, language packs, performance, multi-witness semantics | | [Contributing](https://tracealign.readthedocs.io/en/latest/contributing.html) | Development workflow, TDD discipline, branch model | ## Project status | | | |---|---| -| Current release | 0.1.1 | -| Roadmap | [docs/ROADMAP.md](docs/ROADMAP.md) | -| Design spec | [docs/superpowers/specs/2026-04-28-trace-v0.1-design.md](docs/superpowers/specs/2026-04-28-trace-v0.1-design.md) | -| Future sub-projects | Multi-witness master graph · Geniza anchor detection · Text-reuse · Critical edition / apparatus | +| Current PyPI release | 0.1.3 (v0.2.0 in flight on `feature/v0.2-multi-witness`) | +| Roadmap | [docs/ROADMAP.md](docs/ROADMAP.md) — ten-stage long-term vision | +| v0.1 design spec | [docs/superpowers/specs/2026-04-28-trace-v0.1-design.md](docs/superpowers/specs/2026-04-28-trace-v0.1-design.md) | +| v0.2 design spec | 
[docs/superpowers/specs/2026-05-21-trace-v0.2-multi-witness-design.md](docs/superpowers/specs/2026-05-21-trace-v0.2-multi-witness-design.md) | +| Released stages | 1 (pairwise + Hebrew pack) | +| In progress | 2 (master alignment graph / multi-witness) | +| Future sub-projects | Geniza anchor detection · Text-reuse · Apparatus / critical edition · Cross-tradition Hexapla · Stemmatic reconstruction · Allusion detection · Citation graphs · Reception history | -## License +## Citation -[MIT](LICENSE) © 2026 Benjamin Schnabel. +If you use TRACE in academic work, please cite via the [Zenodo concept DOI](https://doi.org/10.5281/zenodo.20315408) (always resolves to the latest archived release) or pick a specific version DOI from the Zenodo record. A `CITATION.cff` is at the repo root — GitHub's "Cite this repository" button generates APA / BibTeX / RIS automatically from it. -## Citation +## License -If you use TRACE in academic work, please cite the repository — a Zenodo DOI will follow with the first non-pre-release tag. +[MIT](LICENSE) © 2026 Benjamin Schnabel. diff --git a/docs/faq.md b/docs/faq.md index 13157df..d20f88f 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -88,4 +88,43 @@ Not specced yet. Candidates from the v0.1 spec: - Per-project editorial-bracket preset bundles. - Performance pass (NumPy vectorization or Cython hot path). -Plus the four long-term sub-projects: master alignment graph, Geniza anchor detection, text-reuse, apparatus generation. +The master alignment graph (multi-witness alignment) shipped as v0.2 — see below. Future long-term stages: Geniza anchor detection, text-reuse, apparatus generation, cross-tradition Hexapla, stemmatic reconstruction, allusion detection, citation graphs, reception history. + +## How does multi-witness alignment differ from pairwise? + +`tracealign.align()` aligns exactly two witnesses. 
`tracealign.align_multi()` (v0.2) aligns N witnesses at once into a single canonical structure — a variant graph (DAG) where every witness has a trail through the graph, plus a derived aligned table view. Variant loci surface as nodes whose constituent witnesses disagree. + +For two witnesses the two functions give similar information; for three or more the multi-witness graph is much more useful than running every pair separately, because it gives one consistent set of variant positions rather than O(N²) overlapping pairwise alignments. + +## Is `align_multi` deterministic? + +Yes. The result is independent of the dict insertion order of the witnesses. Three sources of order-stability are pinned by tests: + +1. `pairwise_distances` sorts witness ids lexicographically before computing the matrix. +2. UPGMA tie-breaking uses the canonical `(min, max)` lexicographic order of cluster members. +3. The topological sort during sequence-vs-graph alignment is stable with respect to node id. + +A dedicated property test (`test_permutation_invariance`) re-runs `align_multi` with reordered inputs and asserts that witness paths and variant loci are identical. + +## How big can multi-witness alignments get? + +The v0.2 target is Sifra-scale: 5–15 witnesses, 1000–5000 tokens each. Larger witness sets (NT-scale, hundreds of witnesses) need anchor-based decomposition, which is a future stage. Geniza fragments specifically will be handled in their own future stage (anchor detection against a large candidate pool), not by adding them all to one master graph. + +## Why UPGMA and not Neighbor-Joining for the guide tree? + +UPGMA is simpler and gives a binary tree with clear cumulative-distance heights — useful as a draft stemma input for the eventual stemmatic-reconstruction stage. UPGMA's "molecular clock" assumption is a known limitation in phylogenetics but is acceptable for ordering the merge sequence in v0.2. 
Neighbor-Joining is a future v0.x candidate when proper stemmatic reconstruction goes live. + +## Can I add a new witness to an existing alignment incrementally? + +Not in v0.2.0 — `align_multi` builds the entire graph in a single call. An incremental "add one witness" API is a v0.2.x candidate; it builds naturally on the existing `align_sequence_to_graph` primitive but requires API design (e.g. should the guide tree be re-balanced? should existing alignment relationships be allowed to change?). Open a discussion or issue if you need this. + +## How do I persist a multi-witness result? + +```python +from tracealign.io import multi_result as mr_io + +mr_io.dump(result, "alignment.json") +restored = mr_io.load("alignment.json") +``` + +`tracealign.io.multi_result` is a dedicated module separate from `tracealign.io.result` (the pairwise JSON I/O). The round-trip preserves the entire result, including the guide tree's distance matrix — important for later stages that reuse it. diff --git a/docs/index.md b/docs/index.md index 8ab2b40..952cad9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,6 @@ # TRACE -**Textual Reuse, Alignment, and Collation Engine** — a Python library for pairwise philological alignment with pluggable language packs. +**Textual Reuse, Alignment, and Collation Engine** — a Python library for philological alignment with pluggable language packs. Pairwise (v0.1) and simultaneous multi-witness (v0.2) alignment. TRACE is built for textual criticism, manuscript witness comparison, and the creation of digital synopses and critical editions. The core is language-agnostic; the first shipped language pack covers Biblical and Rabbinic Hebrew (`hbo`). @@ -8,10 +8,11 @@ TRACE is built for textual criticism, manuscript witness comparison, and the cre - **Tokenizer pipeline** with editorial-marker awareness (`[reconstructed]`, `⟦deletion⟧`, `〈insertion〉`, `(expanded)`, lacunae). 
- **Tiered scoring** that returns *(score, reason)* per token pair — `EXACT`, `NIQQUD_STRIPPED`, `PLENE_DEFECTIVE`, `ABBREVIATION`, `ORTHOGRAPHIC`, `INSERTION`, `OMISSION`, `NO_MATCH`. -- **Semi-global Needleman–Wunsch** with affine gap penalties (Gotoh) and a multi-token abbreviation lookahead (`ר"י` ↔ `רבי ישמעאל`). +- **Pairwise aligner** — semi-global Needleman–Wunsch with affine gap penalties (Gotoh) and a multi-token abbreviation lookahead (`ר"י` ↔ `רבי ישמעאל`). +- **Multi-witness aligner** (v0.2) — N witnesses aligned simultaneously into a canonical variant graph plus a derived aligned table, via pairwise distances → UPGMA guide tree → POA-based progressive merge. Determinism and lossless reconstruction are pinned by property tests. - **Hebrew language pack** with niqqud strip, plene/defective skeleton matching, gershayim/maqqef tokenizer hooks, and a seed lexicon of rabbinic abbreviations (extendable via `Lexica.merge()`). -- **I/O** for plain text, JSON (round-trip), eScriptorium exports, and TEI XML. -- **Reproducible**: every `AlignmentResult` carries `trace_version` and `language_pack_version` in its params. +- **I/O** for plain text, JSON (round-trip for both pairwise and multi-witness results), eScriptorium exports, and TEI XML. +- **Reproducible**: every `AlignmentResult` / `MultiAlignmentResult` carries `trace_version` and `language_pack_version` in its params. ## Get going @@ -28,7 +29,7 @@ contributing ## Project status -TRACE is an early-stage research library. v0.1.x ships the pairwise aligner and the Hebrew pack; future sub-projects cover multi-witness master graphs, Geniza fragment anchor detection, text-reuse detection, and apparatus / critical-edition generation. See the [roadmap](https://github.com/bsesic/trace/blob/main/docs/ROADMAP.md) for the long-term plan. +TRACE is an early-stage research library. v0.1.x ships the pairwise aligner and the Hebrew pack; v0.2 adds the multi-witness master alignment graph. 
Future stages cover Geniza fragment anchor detection, text-reuse detection, apparatus / critical-edition generation, cross-tradition Hexapla-style alignment, stemmatic reconstruction, allusion detection, citation graphs, and multi-millennial reception history. See the [roadmap](https://github.com/bsesic/trace/blob/main/docs/ROADMAP.md) for the long-term ten-stage plan. ## License