From 6df62a7f4ecf1d0bcb3fcddc87fd1e2b61dc9c67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?=
 <semidan.robaina@gmail.com>
Date: Fri, 19 Jun 2026 14:00:05 +0100
Subject: [PATCH 1/2] Add Pynteny MCP server (mcp/) for LLM agents

Adds an MCP (Model Context Protocol) server under mcp/ that exposes the Pynteny
API as 6 tools (info, validate/parse synteny structure, build, search, download)
so MCP-compatible agents (Claude, DeepSeek, etc.) can drive Pynteny in natural
language. Includes a no-API-key smoke test and a Claude+DeepSeek example agent.

This lives in the repo but is intentionally kept out of the pynteny PyPI package
(packaging includes only src/pynteny), so it does not affect the distribution.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 mcp/.env.example                     |  20 ++
 mcp/.gitignore                       |  13 +
 mcp/README.md                        | 174 +++++++++++
 mcp/examples/synteny_search_agent.py | 290 ++++++++++++++++++
 mcp/requirements.txt                 |  17 ++
 mcp/src/pynteny_mcp/__init__.py      |   3 +
 mcp/src/pynteny_mcp/server.py        | 249 ++++++++++++++++
 mcp/src/pynteny_mcp/service.py       | 430 +++++++++++++++++++++++++++
 mcp/tests/smoke_test.py              | 152 ++++++++++
 9 files changed, 1348 insertions(+)
 create mode 100644 mcp/.env.example
 create mode 100644 mcp/.gitignore
 create mode 100644 mcp/README.md
 create mode 100644 mcp/examples/synteny_search_agent.py
 create mode 100644 mcp/requirements.txt
 create mode 100644 mcp/src/pynteny_mcp/__init__.py
 create mode 100644 mcp/src/pynteny_mcp/server.py
 create mode 100644 mcp/src/pynteny_mcp/service.py
 create mode 100644 mcp/tests/smoke_test.py

diff --git a/mcp/.env.example b/mcp/.env.example
new file mode 100644
index 0000000..95a0c04
--- /dev/null
+++ b/mcp/.env.example
@@ -0,0 +1,20 @@
+# Copy this file to `.env` and fill in your keys. `.env` is gitignored.
+# Used by examples/synteny_search_agent.py (the MCP server itself needs no keys).
+
+# --- Claude (Anthropic) ---
+ANTHROPIC_API_KEY=sk-ant-...
+ANTHROPIC_MODEL=claude-opus-4-8
+
+# --- DeepSeek (OpenAI-compatible API) ---
+DEEPSEEK_API_KEY=sk-...
+DEEPSEEK_BASE_URL=https://api.deepseek.com
+DEEPSEEK_MODEL=deepseek-v4-pro
+
+# --- Optional Pynteny defaults for the example agent ---
+# These let the agent run searches without you re-specifying paths each time.
+# A labelled peptide database (output of `build_peptide_database`) to search:
+# PYNTENY_DATA=/abs/path/to/labelled_peptides.faa
+# Directory of HMM files (e.g. the downloaded PGAP database):
+# PYNTENY_HMM_DIR=/abs/path/to/data/hmms
+# HMM metadata TSV (PGAP/PFAM), needed for gene-symbol searches:
+# PYNTENY_HMM_META=/abs/path/to/data/hmms/hmm_PGAP.tsv
diff --git a/mcp/.gitignore b/mcp/.gitignore
new file mode 100644
index 0000000..2fd5649
--- /dev/null
+++ b/mcp/.gitignore
@@ -0,0 +1,13 @@
+# Secrets — never commit real API keys
+.env
+.env.local
+*.env
+!.env.example
+
+# Python build / cache
+__pycache__/
+*.py[cod]
+*.egg-info/
+build/
+dist/
+.venv/
diff --git a/mcp/README.md b/mcp/README.md
new file mode 100644
index 0000000..e671696
--- /dev/null
+++ b/mcp/README.md
@@ -0,0 +1,174 @@
+# Pynteny MCP server
+
+A [Model Context Protocol](https://modelcontextprotocol.io) server that exposes
+the [Pynteny](../) API as tools, so any MCP-compatible agent — Claude Desktop,
+Claude Code, or a custom client driving Claude / DeepSeek / etc. — can run
+synteny-aware HMM searches over genomic sequence data in natural language.
+
+It wraps `pynteny.api` (`Search` / `Build` / `Download`) and the synteny-structure
+parsers, returning **compact JSON summaries** (the matched hits table and the
+paths of the files Pynteny writes) instead of dumping large objects.
+
+```
+mcp/
+├── pyproject.toml             # installable package: `pynteny-mcp`
+├── requirements.txt           # server + example deps
+├── .env.example               # copy to .env and fill in keys (gitignored)
+├── src/pynteny_mcp/
+│   ├── server.py              # FastMCP server + tool definitions
+│   └── service.py             # API wrappers, stdout-safe logging, JSON summaries
+├── examples/
+│   └── synteny_search_agent.py    # LLM agent (Claude + DeepSeek) demo
+└── tests/
+    └── smoke_test.py          # no-API-key end-to-end check against the test data
+```
+
+## What is a synteny structure?
+
+A synteny structure describes a target gene arrangement, e.g.
+
+```
+>leuD 0 >leuC 1 <leuA
+```
+
+where each token is an **HMM name** (or a **gene symbol**, with `gene_ids=true`),
+the leading `>`/`<` gives the **strand** (sense / antisense), and the integers are
+the **maximum number of (untargeted) genes** allowed between neighbours. Groups of
+interchangeable HMMs for one gene are written `(HMM_A|HMM_B)`.
+
+## Tools
+
+| Tool | What it does |
+|------|--------------|
+| `get_pynteny_info` | Pynteny version, citation, and which HMM databases (PGAP/PFAM) are downloaded |
+| `validate_synteny_structure` | Validate & decompose a structure (HMMs, strands, distances) — no I/O |
+| `parse_gene_symbols` | Translate a gene-symbol structure into HMM names via a metadata table |
+| `build_peptide_database` | Predict ORFs and label them from a nucleotide assembly / GenBank |
+| `run_synteny_search` | **Core:** run the synteny-aware HMM search; return matched hits + output paths (incl. `best_hmm_wins` for paralog cross-hits) |
+| `download_hmm_databases` | Download the PGAP and/or PFAM profile-HMM databases |
+
+## Prerequisites
+
+A Python ≥ 3.10 environment (required by the `mcp` SDK) with **Pynteny**
+installed. Pynteny is now a pure-Python pip package — HMMER and Prodigal come bundled via
+[pyhmmer](https://github.com/althonos/pyhmmer) and
+[pyrodigal](https://github.com/althonos/pyrodigal), so **no conda environment or
+external binaries are required**:
+
+```bash
+pip install pynteny            # or: pip install git+https://github.com/Robaina/Pynteny.git
+```
+
+## Install
+
+Install the server (and example) dependencies into that same environment:
+
+```bash
+cd mcp
+pip install -r requirements.txt          # mcp, pynteny, anthropic, openai, python-dotenv
+# optional — register the `pynteny-mcp` console script:
+pip install -e .
+```
+
+## Configure
+
+Only the example agent needs API keys; the server itself does not.
+
+```bash
+cp .env.example .env        # .env is gitignored
+```
+
+Edit `.env`:
+
+```ini
+ANTHROPIC_API_KEY=sk-ant-...
+DEEPSEEK_API_KEY=sk-...
+# optional defaults so the agent need not repeat paths:
+# PYNTENY_DATA=/abs/path/to/labelled_peptides.faa
+# PYNTENY_HMM_DIR=/abs/path/to/data/hmms
+# PYNTENY_HMM_META=/abs/path/to/data/hmms/hmm_PGAP.tsv
+```
+
+## Verify (no API keys needed)
+
+Runs the server against Pynteny's committed test genome and asserts the known
+*leu*-operon synteny hits:
+
+```bash
+python tests/smoke_test.py
+# → "All smoke-test checks passed."
+```
+
+## Run the example agent
+
+An LLM decides how to validate the structure and run the search to answer a
+question. By default it searches Pynteny's test genome
+(`../tests/test_data/MG1655.fasta`), so it works with no extra data; set
+`PYNTENY_DATA` / `PYNTENY_HMM_DIR` / `PYNTENY_HMM_META` to search your own.
+
+```bash
+python examples/synteny_search_agent.py --provider claude
+python examples/synteny_search_agent.py --provider deepseek
+python examples/synteny_search_agent.py --provider claude \
+    --question "Is leuD-leuC-leuA syntenic in this genome, and on which strand?"
+```
+
+The example launches the MCP server itself (as a stdio subprocess using the same
+Python interpreter), connects as an MCP client, converts the MCP tool schemas to
+each provider's tool format, and runs a manual tool-use loop.
+
+> **Models.** Claude defaults to `claude-opus-4-8` with adaptive thinking;
+> DeepSeek defaults to `deepseek-v4-pro`. Override via `ANTHROPIC_MODEL` /
+> `DEEPSEEK_MODEL` in `.env`.
+
+## A typical end-to-end workflow
+
+For real data (not the bundled test genome) the agent (or you) would:
+
+1. `download_hmm_databases(outdir="data/hmms")` — once, to fetch PGAP HMMs + metadata.
+2. `build_peptide_database(data="assembly.fa", outfile="labelled_peptides.faa")` — predict & label ORFs.
+3. `validate_synteny_structure(">leuD 0 >leuC 1 <leuA")` — sanity-check the query.
+4. `run_synteny_search(data="labelled_peptides.faa", synteny_structure=">leuD 0 >leuC 1 <leuA", gene_ids=true)` — search.
+
+## Register with an MCP client
+
+### Claude Desktop / Claude Code
+
+Add to your MCP config (`claude_desktop_config.json`, or Claude Code's
+`.mcp.json`). Use the absolute interpreter path of the environment where
+`pynteny` + `mcp` are installed, and put `src/` on `PYTHONPATH`:
+
+```json
+{
+  "mcpServers": {
+    "pynteny": {
+      "command": "/path/to/python",
+      "args": ["-m", "pynteny_mcp.server"],
+      "env": {
+        "PYTHONPATH": "/abs/path/to/Pynteny/mcp/src"
+      }
+    }
+  }
+}
+```
+
+If you ran `pip install -e .`, you can use the console script instead:
+`"command": ".../bin/pynteny-mcp"` with no `args`/`PYTHONPATH`.
+
+## Notes
+
+- **stdout is sacred.** The stdio transport uses stdout for protocol traffic, but
+  Pynteny logs and prints to stdout. The service layer wraps every Pynteny call
+  so that its stdout (and logging) is redirected to stderr, keeping the protocol
+  stream clean. Set `PYNTENY_MCP_DEBUG=1` to restore the MCP runtime's verbose
+  per-request logging.
+- **No big in-memory database.** Unlike a database server, each tool call is a
+  Pynteny operation; `run_synteny_search` runs HMMER and writes per-gene FASTA
+  files plus a `synteny_matched.tsv` table to `outdir`.
+- **`best_hmm_wins`.** When one peptide is hit by several HMMs (paralog
+  cross-hits), this keeps only the highest-scoring HMM for that peptide before
+  matching the structure. It requires Pynteny ≥ 1.3.0; on an older build the
+  service ignores it and returns a `warning` field rather than failing.
+- **DeepSeek** is reached through its OpenAI-compatible endpoint, so the example
+  uses the `openai` SDK pointed at `DEEPSEEK_BASE_URL`. The same MCP server and
+  tool set serve both providers unchanged.
diff --git a/mcp/examples/synteny_search_agent.py b/mcp/examples/synteny_search_agent.py
new file mode 100644
index 0000000..831e308
--- /dev/null
+++ b/mcp/examples/synteny_search_agent.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python
+"""
+Example: an LLM agent that runs Pynteny synteny searches through the MCP server.
+
+Instead of hand-writing the Pynteny calls, an LLM is given the MCP toolbox and
+decides how to validate a synteny structure and run the search to answer a
+question — e.g. "find the leucine-biosynthesis (leuD-leuC-leuA) synteny block in
+this genome".
+
+The same MCP toolbox is driven by two providers so you can compare them:
+
+* Claude   — via the official `anthropic` SDK (`claude-opus-4-8` by default)
+* DeepSeek — via the OpenAI-compatible `openai` SDK
+
+Both connect to the *same* MCP server (``pynteny_mcp.server``) launched as a
+subprocess over stdio; only the model and the tool-schema adapter differ.
+
+Usage:
+    cp .env.example .env      # then fill in your API keys (+ optional PYNTENY_* paths)
+    python examples/synteny_search_agent.py --provider claude
+    python examples/synteny_search_agent.py --provider deepseek
+    python examples/synteny_search_agent.py --provider claude --question "..."
+
+By default it runs against Pynteny's committed test genome
+(../tests/test_data/MG1655.fasta) so it works with no extra data; point
+PYNTENY_DATA / PYNTENY_HMM_DIR / PYNTENY_HMM_META at your own files to search
+real data.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import os
+import sys
+from datetime import timedelta
+from pathlib import Path
+
+from dotenv import load_dotenv
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+HERE = Path(__file__).resolve().parent
+MCP_DIR = HERE.parent
+SRC_DIR = MCP_DIR / "src"
+REPO_ROOT = MCP_DIR.parent
+ENV_PATH = MCP_DIR / ".env"
+
+load_dotenv(ENV_PATH)
+
+# Fall back to Pynteny's committed test data so the demo runs out of the box.
+TEST_DATA = REPO_ROOT / "tests" / "test_data"
+DEFAULT_DATA = os.environ.get("PYNTENY_DATA", str(TEST_DATA / "MG1655.fasta"))
+DEFAULT_HMM_DIR = os.environ.get("PYNTENY_HMM_DIR", str(TEST_DATA / "hmms"))
+DEFAULT_HMM_META = os.environ.get("PYNTENY_HMM_META", str(TEST_DATA / "hmm_meta.tsv"))
+
+# Per-tool-call timeout: a search runs HMMER over the whole database.
+TOOL_TIMEOUT = timedelta(seconds=600)
+MAX_TURNS = 14  # guard against runaway tool loops
+
+SYSTEM_PROMPT = (
+    "You are a comparative-genomics assistant with access to Pynteny through "
+    "tools. Pynteny finds synteny blocks: genes hit by profile HMMs whose "
+    "genomic arrangement (order, strand, spacing) matches a synteny structure "
+    "like '>leuD 0 >leuC 1 <leuA'. Use the tools rather than prior knowledge: "
+    "validate a structure before searching, then run the search and report the "
+    "matched genes (contig, gene id, strand, HMM, and gene symbol/product when "
+    "available) and where the result files were written. Be concise.\n\n"
+    f"Unless told otherwise, search this database: {DEFAULT_DATA}\n"
+    f"Using this HMM directory: {DEFAULT_HMM_DIR}\n"
+    f"And this HMM metadata file: {DEFAULT_HMM_META}\n"
+    "Pass reuse=true so repeated runs don't recompute HMMER outputs."
+)
+
+DEFAULT_QUESTION = (
+    "Find the leucine-biosynthesis synteny block in the genome. The genes "
+    "leuD, leuC and leuA are expected adjacent on the antisense strand. Their "
+    "HMMs are leuD=(TIGR00171.1|TIGR02084.1), leuC=(TIGR00170.1|TIGR02083.1), "
+    "leuA=(TIGR00973.1|NF002084.0|TIGR00970.1). Allow 0 genes between leuD and "
+    "leuC and up to 1 between leuC and leuA. Which genes match, and on which "
+    "contig?"
+)
+
+
+# --------------------------------------------------------------------------- #
+# MCP toolbox: connect to the server and adapt its tools to each provider     #
+# --------------------------------------------------------------------------- #
+class MCPToolbox:
+    def __init__(self, session: ClientSession):
+        self.session = session
+        self.tools = []
+
+    async def load(self) -> None:
+        self.tools = (await self.session.list_tools()).tools
+
+    async def call(self, name: str, arguments: dict) -> str:
+        result = await self.session.call_tool(
+            name, arguments, read_timeout_seconds=TOOL_TIMEOUT
+        )
+        parts = [getattr(b, "text", "") for b in result.content if getattr(b, "text", "")]
+        text = "\n".join(parts).strip()
+        if not text and getattr(result, "structuredContent", None):
+            text = json.dumps(result.structuredContent)
+        if getattr(result, "isError", False):
+            return f"ERROR: {text or '(unknown tool error)'}"
+        return text or "(no content)"
+
+    def anthropic_tools(self) -> list[dict]:
+        return [
+            {"name": t.name, "description": t.description or "", "input_schema": t.inputSchema}
+            for t in self.tools
+        ]
+
+    def openai_tools(self) -> list[dict]:
+        return [
+            {
+                "type": "function",
+                "function": {
+                    "name": t.name,
+                    "description": t.description or "",
+                    "parameters": t.inputSchema,
+                },
+            }
+            for t in self.tools
+        ]
+
+
+def _log_tool_call(name: str, arguments: dict) -> None:
+    print(f"  → tool: {name}({json.dumps(arguments, ensure_ascii=False)})")
+
+
+# --------------------------------------------------------------------------- #
+# Claude backend (official anthropic SDK, manual agentic loop)                #
+# --------------------------------------------------------------------------- #
+async def run_claude(toolbox: MCPToolbox, question: str) -> str:
+    from anthropic import AsyncAnthropic
+
+    model = os.environ.get("ANTHROPIC_MODEL", "claude-opus-4-8")
+    effort = os.environ.get("ANTHROPIC_EFFORT", "medium")
+    client = AsyncAnthropic()  # reads ANTHROPIC_API_KEY from the environment
+    tools = toolbox.anthropic_tools()
+    messages: list[dict] = [{"role": "user", "content": question}]
+
+    for _ in range(MAX_TURNS):
+        resp = await client.messages.create(
+            model=model,
+            max_tokens=16000,
+            system=SYSTEM_PROMPT,
+            tools=tools,
+            # Adaptive thinking lets Claude decide how much to reason between
+            # tool calls; effort trades thoroughness against token cost.
+            thinking={"type": "adaptive", "display": "summarized"},
+            output_config={"effort": effort},
+            messages=messages,
+        )
+
+        for block in resp.content:
+            if block.type == "thinking" and getattr(block, "thinking", ""):
+                print(f"  [thinking] {block.thinking.strip()[:300]}")
+            elif block.type == "text" and block.text.strip():
+                print(f"  [claude] {block.text.strip()}")
+
+        if resp.stop_reason != "tool_use":
+            return "".join(b.text for b in resp.content if b.type == "text").strip()
+
+        messages.append({"role": "assistant", "content": resp.content})
+        tool_results = []
+        for block in resp.content:
+            if block.type == "tool_use":
+                _log_tool_call(block.name, block.input)
+                output = await toolbox.call(block.name, block.input)
+                tool_results.append(
+                    {"type": "tool_result", "tool_use_id": block.id, "content": output}
+                )
+        messages.append({"role": "user", "content": tool_results})
+
+    return "(stopped: reached the maximum number of tool-use turns)"
+
+
+# --------------------------------------------------------------------------- #
+# DeepSeek backend (OpenAI-compatible API, manual function-calling loop)       #
+# --------------------------------------------------------------------------- #
+async def run_deepseek(toolbox: MCPToolbox, question: str) -> str:
+    """Answer `question` with DeepSeek, running tool calls through `toolbox` until the
+    model stops requesting tools or MAX_TURNS is hit; returns the final text."""
+    from openai import AsyncOpenAI
+
+    model = os.environ.get("DEEPSEEK_MODEL", "deepseek-v4-pro")
+    client = AsyncOpenAI(
+        api_key=os.environ["DEEPSEEK_API_KEY"],
+        base_url=os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com"),
+    )
+    tools = toolbox.openai_tools()
+    messages: list[dict] = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": question},
+    ]
+
+    for _ in range(MAX_TURNS):
+        resp = await client.chat.completions.create(model=model, messages=messages, tools=tools)
+        msg = resp.choices[0].message
+        if msg.content and msg.content.strip():
+            print(f"  [deepseek] {msg.content.strip()}")
+        assistant: dict = {"role": "assistant", "content": msg.content or ""}
+        if msg.tool_calls:
+            assistant["tool_calls"] = [
+                {
+                    "id": tc.id,
+                    "type": "function",
+                    "function": {"name": tc.function.name, "arguments": tc.function.arguments},
+                }
+                for tc in msg.tool_calls
+            ]
+        messages.append(assistant)
+
+        if not msg.tool_calls:
+            return (msg.content or "").strip()
+
+        for tc in msg.tool_calls:
+            try:
+                arguments = json.loads(tc.function.arguments or "{}")
+            except json.JSONDecodeError as exc:
+                # Tell the model its JSON was bad rather than calling the tool with {}.
+                output = f"ERROR: tool arguments are not valid JSON ({exc})"
+            else:
+                _log_tool_call(tc.function.name, arguments)
+                output = await toolbox.call(tc.function.name, arguments)
+            messages.append({"role": "tool", "tool_call_id": tc.id, "content": output})
+
+    return "(stopped: reached the maximum number of tool-use turns)"
+
+
+# --------------------------------------------------------------------------- #
+# Driver                                                                       #
+# --------------------------------------------------------------------------- #
+async def main_async(provider: str, question: str) -> None:
+    """Launch the MCP server over stdio with the *same* interpreter (so it shares
+    this environment's `pynteny` + `mcp`), then run the chosen provider's agent."""
+    # Prepend src/ to any inherited PYTHONPATH rather than clobbering it.
+    pythonpath = os.pathsep.join(filter(None, [str(SRC_DIR), os.environ.get("PYTHONPATH")]))
+    server_params = StdioServerParameters(
+        command=sys.executable,
+        args=["-m", "pynteny_mcp.server"],
+        env={**os.environ, "PYTHONPATH": pythonpath},
+    )
+
+    async with stdio_client(server_params) as (read, write):
+        async with ClientSession(read, write) as session:
+            await session.initialize()
+            toolbox = MCPToolbox(session)
+            await toolbox.load()
+
+            print(f"Connected to Pynteny MCP server: {len(toolbox.tools)} tools available")
+            print(f"Provider: {provider}\n")
+            print(f"Question:\n  {question}\n")
+            print("--- agent trace ---")
+
+            run_agent = run_claude if provider == "claude" else run_deepseek
+            try:
+                answer = await run_agent(toolbox, question)
+            except Exception as exc:  # noqa: BLE001 — surface a clean message, not a stack trace
+                print(f"\nLLM call failed ({type(exc).__name__}): {exc}")
+                return
+
+            print("\n--- final answer ---")
+            print(answer)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--provider", choices=["claude", "deepseek"], default="claude",
+        help="Which LLM backend to use (default: claude).",
+    )
+    parser.add_argument(
+        "--question", default=DEFAULT_QUESTION,
+        help="Question to ask the agent (defaults to the leu-operon example).",
+    )
+    args = parser.parse_args()
+
+    key_var = "ANTHROPIC_API_KEY" if args.provider == "claude" else "DEEPSEEK_API_KEY"
+    if not os.environ.get(key_var):
+        sys.exit(f"{key_var} is not set. Add it to {ENV_PATH} (see .env.example).")
+
+    asyncio.run(main_async(args.provider, args.question))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mcp/requirements.txt b/mcp/requirements.txt
new file mode 100644
index 0000000..a8e4d5e
--- /dev/null
+++ b/mcp/requirements.txt
@@ -0,0 +1,17 @@
+# Runtime dependencies for the Pynteny MCP server + the example LLM agent.
+# Install into the same environment that has `pynteny` (a pure-Python pip
+# package — no conda or external binaries required):
+#
+#     pip install pynteny            # if not already installed
+#     pip install -r requirements.txt
+#
+# The server itself only needs `mcp` and `pynteny`; anthropic/openai/
+# python-dotenv are only used by examples/synteny_search_agent.py.
+
+mcp>=1.2.0
+pynteny>=1.3.0
+
+# --- example agent only ---
+anthropic>=0.40
+openai>=1.40
+python-dotenv>=1.0
diff --git a/mcp/src/pynteny_mcp/__init__.py b/mcp/src/pynteny_mcp/__init__.py
new file mode 100644
index 0000000..7fa0266
--- /dev/null
+++ b/mcp/src/pynteny_mcp/__init__.py
@@ -0,0 +1,3 @@
+"""Pynteny MCP server — expose the Pynteny API to MCP-compatible LLM agents."""
+
+__version__ = "0.1.0"
diff --git a/mcp/src/pynteny_mcp/server.py b/mcp/src/pynteny_mcp/server.py
new file mode 100644
index 0000000..9080da9
--- /dev/null
+++ b/mcp/src/pynteny_mcp/server.py
@@ -0,0 +1,249 @@
+"""
+Pynteny MCP server.
+
+Exposes the Pynteny API as Model Context Protocol tools so that any
+MCP-compatible agent (Claude Desktop, Claude Code, or a custom client driving
+Claude / DeepSeek / etc.) can run synteny-aware HMM searches over genomic
+sequence data in natural language.
+
+Pynteny (https://github.com/Robaina/Pynteny) finds synteny blocks in
+(prokaryotic) sequence data: it searches a labelled peptide database with
+profile HMMs (via HMMER / pyhmmer) and keeps only the hits whose genomic
+arrangement — order, strand and gene spacing — matches a user-supplied synteny
+structure such as ``>leuD 0 >leuC 1 <leuA``.
+
+Run it (stdio transport, the default):
+
+    pynteny-mcp
+    # or, without installing this package:
+    PYTHONPATH=src python -m pynteny_mcp.server
+
+Pynteny is a pure-Python pip package (HMMER and Prodigal come bundled via
+pyhmmer / pyrodigal), so no conda environment or external binaries are needed —
+only an environment in which `pip install pynteny` has been run.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from mcp.server.fastmcp import FastMCP
+
+from . import service
+
+INSTRUCTIONS = """\
+Tools to run Pynteny synteny-aware HMM searches over genomic sequence data.
+
+A *synteny structure* describes a target gene arrangement, e.g.
+  ">leuD 0 >leuC 1 <leuA"
+where each token is an HMM name (or gene symbol, with gene_ids=True), the
+leading ">"/"<" is the strand (sense / antisense), and the integers are the
+maximum number of (untargeted) genes allowed between neighbours. Groups of
+interchangeable HMMs for one gene are written "(HMM_A|HMM_B)".
+
+Typical workflow:
+  1. `get_pynteny_info` — confirm the install and see which HMM databases
+     (PGAP / PFAM) are already downloaded.
+  2. `validate_synteny_structure` — cheaply check/parse a structure before a run.
+  3. (once) `download_hmm_databases` — fetch the PGAP HMMs + metadata, unless you
+     already have an `hmm_dir`.
+  4. `build_peptide_database` — turn a nucleotide assembly / GenBank file into the
+     labelled peptide FASTA that search consumes (skip if you already have one).
+  5. `run_synteny_search` — the core tool: returns the matched hits table and the
+     paths of the FASTA / TSV files Pynteny writes.
+
+Use `parse_gene_symbols` to translate a gene-symbol structure into HMM names
+against a metadata file. Searching by gene symbol directly is done by passing
+gene_ids=True to `run_synteny_search`.
+"""
+
+mcp = FastMCP("pynteny", instructions=INSTRUCTIONS)
+
+
+@mcp.tool()
+def get_pynteny_info() -> dict:
+    """Return the installed Pynteny version, author, citation, and which HMM
+    databases (PGAP / PFAM) are currently downloaded and configured. A good
+    first call to confirm the server is wired up and to decide whether you need
+    to download HMMs before searching."""
+    return service.pynteny_info()
+
+
+@mcp.tool()
+def validate_synteny_structure(synteny_structure: str) -> dict:
+    """Validate and decompose a synteny structure string without running any
+    search or touching any file. Returns whether it is well-formed plus its
+    parsed parts: the HMM groups, the flat list of HMM names, the strand of each
+    gene, and the max-distance constraints between neighbours. Use this to catch
+    formatting mistakes before an expensive `run_synteny_search`.
+
+    Args:
+        synteny_structure: e.g. ">leuD 0 >leuC 1 <leuA". Tokens are HMM names (or
+            gene symbols), ">"/"<" give the strand, integers are the maximum
+            number of genes allowed between neighbours, and "(A|B)" groups
+            interchangeable HMMs for one gene.
+    """
+    return service.validate_structure(synteny_structure)
+
+
+@mcp.tool()
+def parse_gene_symbols(synteny_structure: str, hmm_meta: Optional[str] = None) -> dict:
+    """Translate a synteny structure written with *gene symbols* (e.g. "leuD")
+    into one written with the corresponding *HMM names* (e.g. "TIGR00171.1"),
+    using a PGAP/PFAM metadata table.
+
+    Args:
+        synteny_structure: gene-symbol structure, e.g. ">leuD 0 >leuC 1 <leuA".
+        hmm_meta: path to the HMM metadata TSV. If omitted, the PGAP metadata
+            file recorded in Pynteny's config (from a previous download) is used.
+    """
+    return service.parse_gene_symbols(synteny_structure, hmm_meta)
+
+
+@mcp.tool()
+def build_peptide_database(
+    data: str,
+    outfile: Optional[str] = None,
+    prepend: bool = False,
+    processes: Optional[int] = None,
+    tempdir: Optional[str] = None,
+    logfile: Optional[str] = None,
+) -> dict:
+    """Build a labelled peptide database from a nucleotide assembly (or a GenBank
+    file / directory). ORFs are predicted (Prodigal/pyrodigal) and each peptide
+    is labelled with its contig and gene-location info so that synteny search can
+    reason about gene order and strand. Returns the output file path and the
+    number of peptides written.
+
+    Args:
+        data: path to the input assembly FASTA, GenBank file, or a directory of
+            such files.
+        outfile: path for the labelled peptide FASTA. Defaults to a name derived
+            from `data`.
+        prepend: when `data` is a directory, prepend each file's name to its
+            sequence IDs (keeps IDs unique across genomes).
+        processes: max worker processes (defaults to all CPUs minus one).
+        tempdir: directory for temporary files.
+        logfile: path to a log file (logs otherwise go to stderr/devnull).
+    """
+    return service.build_database(
+        data,
+        outfile=outfile,
+        prepend=prepend,
+        processes=processes,
+        tempdir=tempdir,
+        logfile=logfile,
+    )
+
+
+@mcp.tool()
+def run_synteny_search(
+    data: str,
+    synteny_structure: str,
+    hmm_dir: Optional[str] = None,
+    hmm_meta: Optional[str] = None,
+    gene_ids: bool = False,
+    unordered: bool = False,
+    best_hmm_wins: bool = False,
+    reuse: bool = False,
+    outdir: Optional[str] = None,
+    prefix: str = "",
+    hmmsearch_args: Optional[str] = None,
+    processes: Optional[int] = None,
+    logfile: Optional[str] = None,
+    max_hits: int = 200,
+) -> dict:
+    """Search a labelled peptide database for genes arranged in the given synteny
+    structure. This is the core Pynteny operation. It runs HMMER for each HMM,
+    then keeps only hits whose genomic context (order, strand, spacing) matches
+    the structure. Returns the matched hits as records, plus the paths of the
+    per-gene FASTA files and the `synteny_matched.tsv` table Pynteny writes.
+
+    Args:
+        data: path to the labelled peptide database (output of
+            `build_peptide_database`).
+        synteny_structure: target arrangement, e.g. ">leuD 0 >leuC 1 <leuA"
+            (HMM names), or gene symbols if `gene_ids=True`.
+        hmm_dir: directory containing the HMM files. If omitted, the downloaded
+            PGAP directory from Pynteny's config is used; if none is available
+            the call returns an error rather than triggering a large download.
+        hmm_meta: HMM metadata TSV; needed for `gene_ids=True` and used to
+            annotate hits with gene symbol / product / EC number when available.
+        gene_ids: treat tokens in `synteny_structure` as gene symbols instead of
+            HMM names (translated via `hmm_meta`).
+        unordered: match a syntenic *set* in any order instead of the exact
+            collinear order given.
+        best_hmm_wins: when one peptide is hit by several HMMs (paralog
+            cross-hits), keep only the highest-scoring HMM for that peptide.
+        reuse: reuse existing HMMER outputs in `outdir` instead of re-running.
+        outdir: output directory (defaults to the directory of `data`).
+        prefix: prefix for output file names.
+        hmmsearch_args: extra hmmsearch arguments (a single string applied to all
+            HMMs, or comma-separated per-HMM with "None" to skip one).
+        processes: max worker processes (defaults to all CPUs minus one).
+        logfile: path to a log file.
+        max_hits: cap on the number of hit records returned inline (the full set
+            is always written to the TSV).
+    """
+    return service.run_search(
+        data,
+        synteny_structure,
+        gene_ids=gene_ids,
+        unordered=unordered,
+        best_hmm_wins=best_hmm_wins,
+        reuse=reuse,
+        hmm_dir=hmm_dir,
+        hmm_meta=hmm_meta,
+        outdir=outdir,
+        prefix=prefix,
+        hmmsearch_args=hmmsearch_args,
+        processes=processes,
+        logfile=logfile,
+        max_hits=max_hits,
+    )
+
+
+@mcp.tool()
+def download_hmm_databases(
+    outdir: str,
+    pgap: bool = True,
+    pfam: bool = False,
+    unpack: bool = True,
+    force: bool = False,
+    logfile: Optional[str] = None,
+) -> dict:
+    """Download profile-HMM databases (PGAP from NCBI and/or PFAM-A) and register
+    them in Pynteny's config so later searches can find them automatically. This
+    downloads large files over the network and can take a while.
+
+    Args:
+        outdir: directory to download the HMM databases into.
+        pgap: download the PGAP database (HMMs + metadata). Default True.
+        pfam: download the PFAM-A database. Default False.
+        unpack: unpack the archives now (otherwise Pynteny unpacks per session).
+        force: re-download even if already present.
+        logfile: path to a log file.
+    """
+    return service.download_databases(
+        outdir,
+        pgap=pgap,
+        pfam=pfam,
+        unpack=unpack,
+        force=force,
+        logfile=logfile,
+    )
+
+
+def main() -> None:
+    """Console-script entry point: run the server over stdio."""
+    import logging
+    import os
+
+    # Quiet the per-request INFO chatter from the MCP runtime unless debugging.
+    if not os.environ.get("PYNTENY_MCP_DEBUG"):
+        logging.getLogger("mcp").setLevel(logging.WARNING)
+    mcp.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mcp/src/pynteny_mcp/service.py b/mcp/src/pynteny_mcp/service.py
new file mode 100644
index 0000000..07cbdb4
--- /dev/null
+++ b/mcp/src/pynteny_mcp/service.py
@@ -0,0 +1,430 @@
+"""
+Service layer for the Pynteny MCP server.
+
+Wraps the :mod:`pynteny` public API (``pynteny.api.Search / Build / Download``
+and the synteny-structure parsers) with helpers that:
+
+* keep the MCP **stdio channel clean** — Pynteny logs to ``stdout`` and several
+  helpers print there, which would corrupt the JSON-RPC stream, so every call
+  into Pynteny is wrapped in :func:`_pynteny_run` which redirects ``stdout`` to
+  ``stderr`` and isolates Pynteny's root-logger handlers, and
+* turn the rich return values (``SyntenyHits`` / pandas ``DataFrame`` / written
+  output files) into compact, JSON-serialisable summaries.
+
+Everything here is transport-agnostic; ``server.py`` only adds the MCP tool
+definitions on top.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Any, Optional
+
+
+def _log(msg: str) -> None:
+    # stdout carries the MCP stdio protocol, so diagnostics must go to stderr only.
+    print("[pynteny-mcp]", msg, file=sys.stderr, flush=True)
+
+
+@contextlib.contextmanager
+def _pynteny_run():
+    """Run a block of Pynteny code without polluting the MCP stdio channel.
+
+    Pynteny's ``init_logger`` calls ``logging.basicConfig`` with a
+    ``StreamHandler(sys.stdout)`` and several code paths ``print`` to stdout. The
+    MCP stdio transport uses stdout for protocol traffic, so any of that would
+    break the connection. We:
+
+    * redirect ``sys.stdout`` to ``sys.stderr`` for the duration of the call
+      (so the StreamHandler Pynteny installs binds to stderr, and stray prints
+      go to stderr too), and
+    * detach the root logger's existing handlers first and restore them after,
+      closing the ones Pynteny added, so repeated calls don't accumulate or leak
+      handlers or reuse ones Pynteny closed via ``logging.shutdown()``.
+    """
+    root = logging.getLogger()
+    saved_handlers = root.handlers[:]
+    for h in saved_handlers:
+        root.removeHandler(h)
+    try:
+        with contextlib.redirect_stdout(sys.stderr):
+            yield
+    finally:
+        # Drop and close whatever Pynteny installed (close() is safe on handlers it
+        # already closed; avoids leaking file-backed handlers), then restore ours.
+        for h in root.handlers[:]:
+            root.removeHandler(h)
+            h.close()
+        for h in saved_handlers:
+            root.addHandler(h)
+
+
+# --------------------------------------------------------------------------- #
+# Config / metadata                                                           #
+# --------------------------------------------------------------------------- #
+def pynteny_info() -> dict[str, Any]:
+    """Report version, author, citation, config file and configured HMM databases."""
+    from pynteny.api import __author__, __version__
+    from pynteny.utils import CommandArgs, ConfigParser
+    from pynteny.subcommands import get_citation
+
+    citation = get_citation(CommandArgs(version=__version__, author=__author__), silent=True)
+
+    config = ConfigParser.get_default_config()
+    db: dict[str, Any] = {}
+    for field in (
+        "PGAP_data_downloaded",
+        "PFAM_data_downloaded",
+        "PGAP_database",
+        "PGAP_meta_file",
+        "PFAM_database",
+        "PFAM_meta_file",
+        "database_dir",
+    ):
+        try:
+            db[field] = config.get_field(field)
+        except Exception:  # any failed lookup is reported as None, not raised
+            db[field] = None
+    return {
+        "version": __version__,
+        "author": __author__,
+        "citation": citation,
+        "config_file": str(config.get_config_path()),
+        "hmm_databases": db,
+    }
+
+
+def _resolve_hmm_meta(hmm_meta: Optional[str]) -> Optional[str]:
+    """Return ``hmm_meta``, or else the PGAP/PFAM meta file from Pynteny's config."""
+    if hmm_meta:
+        return hmm_meta
+    from pynteny.utils import ConfigParser
+
+    defaults = ConfigParser.get_default_config()
+    for key in ("PGAP_meta_file", "PFAM_meta_file"):  # PGAP is tried first
+        try:
+            configured = defaults.get_field(key)
+        except Exception:
+            continue  # a failed lookup counts as "not configured"
+        if configured:
+            return configured
+    return None
+
+
+def _resolve_hmm_dir(hmm_dir: Optional[str]) -> Optional[str]:
+    """Return ``hmm_dir``, or else the PGAP/PFAM database from Pynteny's config."""
+    if hmm_dir:
+        return hmm_dir
+    from pynteny.utils import ConfigParser
+
+    defaults = ConfigParser.get_default_config()
+    for key in ("PGAP_database", "PFAM_database"):  # PGAP is tried first
+        try:
+            configured = defaults.get_field(key)
+        except Exception:
+            continue  # a failed lookup counts as "not configured"
+        if configured:
+            return configured
+    return None
+
+
+# --------------------------------------------------------------------------- #
+# Synteny-structure parsing (no I/O)                                          #
+# --------------------------------------------------------------------------- #
+def validate_structure(synteny_structure: str) -> dict[str, Any]:
+    """Normalise a synteny structure string and report what it contains.
+
+    Only string parsing happens here (no HMMs, files or searches). The parser
+    helpers run under ``_pynteny_run``; a ``SystemExit`` or any other exception
+    they raise is reported as ``valid=False`` plus an ``error`` message instead
+    of propagating and killing the server.
+    """
+    import pynteny.parsers.syntenyparser as sp
+
+    # NOTE(review): this call sits outside the guard below — confirm that it can
+    # neither raise nor call sys.exit on malformed input.
+    reformatted = sp.reformat_synteny_structure(synteny_structure)
+    result: dict[str, Any] = {"input": synteny_structure, "reformatted": reformatted}
+    try:
+        with _pynteny_run():
+            is_valid = sp.is_valid_structure(reformatted)
+            groups = sp.get_HMM_groups_in_structure(reformatted)
+            names = sp.get_all_HMMs_in_structure(reformatted)
+            strand_list = sp.get_strands_in_structure(reformatted)
+            max_gaps = sp.get_maximum_distances_in_structure(reformatted)
+    except SystemExit:
+        # Pynteny's parser helpers call sys.exit on some malformed input.
+        result["valid"] = False
+        result["error"] = (
+            "Could not parse the structure. Expected something like "
+            "'>hmm_a 0 >hmm_b 3 <hmm_c' (strand '>'/'<', integers = max genes "
+            "between neighbours)."
+        )
+        return result
+    except Exception as exc:  # noqa: BLE001
+        result.update(valid=False, error=f"{type(exc).__name__}: {exc}")
+        return result
+
+    # Everything parsed: attach the decomposition of the structure. Keyword
+    # arguments keep their order, so the keys are inserted in the order listed.
+    result.update(
+        valid=bool(is_valid),
+        n_genes=len(groups),
+        hmm_groups=groups,
+        hmm_names=names,
+        strands=strand_list,
+        max_distances=max_gaps,
+        contains_hmm_groups=sp.contains_HMM_groups(reformatted),
+    )
+    return result
+
+
+def parse_gene_symbols(synteny_structure: str, hmm_meta: Optional[str]) -> dict[str, Any]:
+    """Translate a *gene-symbol* synteny structure into one based on HMM names,
+    using a PGAP/PFAM metadata table."""
+    from pynteny.api import Search
+
+    # Prefer the caller's metadata file, else whatever Pynteny's config records.
+    meta_path = _resolve_hmm_meta(hmm_meta)
+    if not meta_path:
+        return {
+            "error": (
+                "No HMM metadata file available. Provide `hmm_meta`, or download "
+                "the PGAP database first with `download_hmm_databases`."
+            )
+        }
+    if not Path(meta_path).exists():
+        return {"error": f"HMM metadata file not found: {meta_path}"}
+
+    # Success and failure both echo the input and the metadata file used; the
+    # third key is either the translated structure or an error message.
+    outcome: dict[str, Any] = {"input": synteny_structure, "hmm_meta": meta_path}
+
+    # NOTE(review): data="." looks like a placeholder, since only parse_genes()
+    # is called on this Search object — confirm the constructor ignores it.
+    searcher = Search(data=".", synteny_struc=synteny_structure, hmm_meta=meta_path)
+    try:
+        with _pynteny_run():
+            outcome["translated_structure"] = searcher.parse_genes(synteny_structure)
+    except SystemExit:
+        # Pynteny exited instead of returning a translation.
+        outcome["error"] = (
+            "One or more gene symbols did not match an HMM in the metadata "
+            "table. Check the gene symbols and the metadata file."
+        )
+    return outcome
+
+
+# --------------------------------------------------------------------------- #
+# Build                                                                       #
+# --------------------------------------------------------------------------- #
+def _count_fasta_records(path: Path) -> int:
+    """Count the FASTA records in ``path``, i.e. lines starting with ``>``.
+
+    Undecodable bytes are ignored; returns -1 if the file cannot be read.
+    """
+    try:
+        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
+            return sum(1 for row in handle if row.startswith(">"))
+    except OSError:
+        return -1
+
+
+def build_database(
+    data: str,
+    *,
+    outfile: Optional[str],
+    prepend: bool,
+    processes: Optional[int],
+    tempdir: Optional[str],
+    logfile: Optional[str],
+) -> dict[str, Any]:
+    """Translate a nucleotide assembly (or GenBank file/dir) into a labelled
+    peptide database; returns a summary dict, or ``{"error": ...}`` on failure."""
+    from pynteny.api import Build
+
+    if not (data_path := Path(data)).exists():
+        return {"error": f"Input data not found: {data}"}
+    build = Build(
+        data=data_path,
+        prepend=prepend,
+        outfile=outfile,
+        logfile=logfile,
+        processes=processes,
+        tempdir=tempdir,
+    )
+    try:
+        with _pynteny_run():
+            build.run()
+    except SystemExit as exc:  # a sys.exit() inside Pynteny must not kill the server
+        return {"error": f"Pynteny build exited with code {exc.code}; see the log."}
+    out = Path(build._args.outfile) if build._args.outfile else None
+    result: dict[str, Any] = {"input": str(data_path)}
+    if out is not None and out.exists():
+        result["output_file"] = str(out)
+        result["output_size_bytes"] = out.stat().st_size
+        result["n_peptides"] = _count_fasta_records(out)
+    else:  # no file on disk at the path Build recorded (or no path at all)
+        result["output_file"] = str(out) if out else None
+        result["warning"] = (
+            "Build completed but the output file could not be located; "
+            "check the logfile."
+        )
+    return result
+
+
+# --------------------------------------------------------------------------- #
+# Search                                                                      #
+# --------------------------------------------------------------------------- #
+def run_search(
+    data: str,
+    synteny_structure: str,
+    *,
+    gene_ids: bool,
+    unordered: bool,
+    best_hmm_wins: bool,
+    reuse: bool,
+    hmm_dir: Optional[str],
+    hmm_meta: Optional[str],
+    outdir: Optional[str],
+    prefix: str,
+    hmmsearch_args: Optional[str],
+    processes: Optional[int],
+    logfile: Optional[str],
+    max_hits: int = 200,
+) -> dict[str, Any]:
+    """Run a synteny-aware HMM search over a labelled peptide database; return the
+    hits (at most ``max_hits``) and output paths, or ``{"error": ...}`` on failure."""
+    from pynteny.api import Search
+
+    data_path = Path(data)
+    if not data_path.exists():
+        return {"error": f"Sequence database not found: {data}"}
+
+    resolved_dir = _resolve_hmm_dir(hmm_dir)
+    if resolved_dir and not Path(resolved_dir).exists():
+        return {"error": f"HMM directory not found: {resolved_dir}"}
+    if resolved_dir is None:
+        return {
+            "error": (
+                "No HMM directory available. Provide `hmm_dir`, or download the "
+                "PGAP database first with `download_hmm_databases` (the search "
+                "would otherwise trigger a large download)."
+            )
+        }
+    resolved_meta = _resolve_hmm_meta(hmm_meta)
+
+    kwargs = dict(
+        data=data_path,
+        synteny_struc=synteny_structure,
+        gene_ids=gene_ids,
+        unordered=unordered,
+        reuse=reuse,
+        hmm_dir=resolved_dir,
+        hmm_meta=resolved_meta,
+        outdir=outdir,
+        prefix=prefix,
+        hmmsearch_args=hmmsearch_args,
+        logfile=logfile,
+        processes=processes,
+    )
+    # `best_hmm_wins` was added after the published 1.2.0 release; pass it only
+    # when the installed Pynteny supports it so this works on both.
+    import inspect
+
+    best_hmm_wins_supported = "best_hmm_wins" in inspect.signature(Search.__init__).parameters
+    if best_hmm_wins_supported:
+        kwargs["best_hmm_wins"] = best_hmm_wins
+
+    search = Search(**kwargs)
+    try:
+        with _pynteny_run():
+            synteny_hits = search.run()
+            if resolved_meta and Path(resolved_meta).exists():
+                annotated = synteny_hits.add_HMM_meta_info_to_hits(resolved_meta)
+                # Returns a SyntenyHits or a bare DataFrame, depending on version.
+                df = annotated.hits if hasattr(annotated, "hits") else annotated
+            else:
+                df = synteny_hits.hits
+    except SystemExit as exc:  # a sys.exit() inside Pynteny must not kill the server
+        return {"error": f"Pynteny search exited with code {exc.code}; see the log."}
+    out_dir = Path(search._args.outdir)
+    # Cast to object first: float columns can otherwise keep NaN (not valid JSON).
+    records = df.astype(object).where(df.notna(), None).to_dict(orient="records")
+    synteny_table = out_dir / f"{prefix}synteny_matched.tsv"
+
+    result: dict[str, Any] = {
+        "synteny_structure": search._args.synteny_struc,
+        "data": str(data_path),
+        "n_hits": int(len(df)),
+        "columns": list(df.columns),
+        "hits": records[:max_hits],
+        "hits_truncated": len(records) > max_hits,
+        "output_dir": str(out_dir),
+        "synteny_table": str(synteny_table) if synteny_table.exists() else None,
+        "fasta_files": sorted(str(p) for p in out_dir.glob(f"{prefix}*_hits.fasta")),
+    }
+    if best_hmm_wins and not best_hmm_wins_supported:
+        result["warning"] = (
+            "best_hmm_wins was requested but the installed Pynteny does not "
+            "support it; the option was ignored."
+        )
+    return result
+
+
+# --------------------------------------------------------------------------- #
+# Download                                                                    #
+# --------------------------------------------------------------------------- #
+def download_databases(
+    outdir: str,
+    *,
+    pgap: bool,
+    pfam: bool,
+    unpack: bool,
+    force: bool,
+    logfile: Optional[str],
+) -> dict[str, Any]:
+    """Download PGAP and/or PFAM profile-HMM databases from NCBI / InterPro.
+
+    Drives ``download_hmms`` directly because :class:`pynteny.api.Download` is
+    PGAP-only. ``exit_code`` holds Pynteny's exit status if it exited early.
+    """
+    from pynteny.subcommands import download_hmms
+    from pynteny.utils import CommandArgs, ConfigParser
+
+    if not (pgap or pfam):
+        return {"error": "Select at least one of pgap / pfam to download."}
+    args = CommandArgs(
+        outdir=Path(outdir),
+        logfile=Path(logfile) if logfile else None,
+        force=force,
+        unpack=unpack,
+        pgap=pgap,
+        pfam=pfam,
+    )
+    exit_code = None  # stays None when download_hmms returns normally
+    try:
+        with _pynteny_run():
+            download_hmms(args)
+    except SystemExit as exc:
+        # download_hmms exits(1) if the databases already exist; surface the code.
+        exit_code = exc.code
+
+    config = ConfigParser.get_default_config()
+    return {
+        "outdir": str(Path(outdir).absolute()),
+        "requested": {"pgap": pgap, "pfam": pfam},
+        "exit_code": exit_code,
+        "hmm_databases": {
+            "PGAP_data_downloaded": config.get_field("PGAP_data_downloaded"),
+            "PFAM_data_downloaded": config.get_field("PFAM_data_downloaded"),
+            "PGAP_database": config.get_field("PGAP_database"),
+            "PGAP_meta_file": config.get_field("PGAP_meta_file"),
+            "PFAM_database": config.get_field("PFAM_database"),
+            "PFAM_meta_file": config.get_field("PFAM_meta_file"),
+        },
+    }
diff --git a/mcp/tests/smoke_test.py b/mcp/tests/smoke_test.py
new file mode 100644
index 0000000..8810aab
--- /dev/null
+++ b/mcp/tests/smoke_test.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+"""
+No-API-key smoke test for the Pynteny MCP server.
+
+Launches the server over stdio, connects as an MCP client, and exercises the
+tools end to end against Pynteny's own committed test data
+(``tests/test_data/MG1655.fasta`` + the ``hmms/`` directory), asserting on the
+known synteny hits for the *leu* operon. No LLM and no API keys are involved —
+this validates the MCP plumbing and the tool logic, including that Pynteny's
+stdout logging does not corrupt the stdio protocol.
+
+    python tests/smoke_test.py
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import sys
+import tempfile
+from datetime import timedelta
+from pathlib import Path
+
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+HERE = Path(__file__).resolve().parent
+MCP_DIR = HERE.parent
+SRC_DIR = MCP_DIR / "src"
+REPO_ROOT = MCP_DIR.parent
+TEST_DATA = REPO_ROOT / "tests" / "test_data"
+
+# The leu-operon synteny structure used by Pynteny's own integration test.
+SYNTENY_STRUC = (
+    "<(TIGR00171.1|TIGR02084.1) 0 "
+    "<(TIGR00170.1|TIGR02083.1) 1 "
+    "<(TIGR00973.1|NF002084.0|TIGR00970.1)"
+)
+EXPECTED_LABELS = {
+    "b0071__U00096_71_78847_79453_neg",
+    "b0072__U00096_72_79463_80864_neg",
+    "b0074__U00096_74_81957_83529_neg",
+}
+
+TIMEOUT = timedelta(seconds=300)
+
+
+async def call(session: ClientSession, name: str, **arguments):
+    result = await session.call_tool(name, arguments, read_timeout_seconds=TIMEOUT)
+    assert not getattr(result, "isError", False), f"{name} returned an error: {result.content}"
+    text = "\n".join(getattr(b, "text", "") for b in result.content if getattr(b, "text", ""))
+    return json.loads(text)
+
+
+def check(label: str, condition: bool, detail: str = "") -> None:
+    suffix = f" — {detail}" if detail else ""  # detail is appended only if given
+    print(f"  [{'PASS' if condition else 'FAIL'}] {label}{suffix}")
+    if not condition:
+        raise AssertionError(f"{label}: {detail}")  # propagates to the caller
+
+
+async def main() -> None:
+    data = TEST_DATA / "MG1655.fasta"
+    hmm_dir = TEST_DATA / "hmms"
+    hmm_meta = TEST_DATA / "hmm_meta.tsv"
+    for p in (data, hmm_dir, hmm_meta):
+        if not p.exists():
+            sys.exit(f"Test data not found: {p}")
+
+    server_params = StdioServerParameters(
+        command=sys.executable,
+        args=["-m", "pynteny_mcp.server"],
+        env={
+            "PYTHONPATH": str(SRC_DIR),
+            "PATH": os.environ.get("PATH", ""),
+        },
+    )
+
+    with tempfile.TemporaryDirectory() as outdir:
+        async with stdio_client(server_params) as (read, write):
+            async with ClientSession(read, write) as session:
+                await session.initialize()
+                tools = (await session.list_tools()).tools
+                print(f"Server exposes {len(tools)} tools: {', '.join(t.name for t in tools)}\n")
+                check("expected tool count", len(tools) == 6, f"got {len(tools)}")
+
+                info = await call(session, "get_pynteny_info")
+                check("pynteny version reported", bool(info.get("version")), str(info.get("version")))
+                check("citation present", "Pynteny" in info.get("citation", ""), "")
+
+                good = await call(session, "validate_synteny_structure",
+                                  synteny_structure=SYNTENY_STRUC)
+                check("valid structure recognised", good["valid"] is True, str(good.get("valid")))
+                check("structure has 3 genes", good["n_genes"] == 3, str(good.get("n_genes")))
+                check("max distances parsed", good["max_distances"] == [0, 1],
+                      str(good.get("max_distances")))
+                check("strands parsed", good["strands"] == ["neg", "neg", "neg"],
+                      str(good.get("strands")))
+
+                # Two adjacent genes with no distance token between them: the
+                # gene/distance counts don't line up, so this is malformed.
+                bad = await call(session, "validate_synteny_structure",
+                                 synteny_structure=">TIGR00171.1 >TIGR00170.1")
+                check("invalid structure rejected", bad["valid"] is False, str(bad.get("valid")))
+
+                hits = await call(
+                    session, "run_synteny_search",
+                    data=str(data),
+                    synteny_structure=SYNTENY_STRUC,
+                    hmm_dir=str(hmm_dir),
+                    hmm_meta=str(hmm_meta),
+                    reuse=True,
+                    outdir=outdir,
+                )
+                check("search returned 3 hits", hits["n_hits"] == 3, str(hits.get("n_hits")))
+                labels = {h.get("full_label") for h in hits["hits"]}
+                check("hit labels match expected leu operon", labels == EXPECTED_LABELS,
+                      str(labels))
+                check("hits annotated with gene_symbol", "gene_symbol" in hits["columns"],
+                      str(hits["columns"]))
+                check("wrote synteny_matched.tsv", bool(hits.get("synteny_table")),
+                      str(hits.get("synteny_table")))
+                check("wrote per-gene FASTA files", len(hits.get("fasta_files", [])) >= 1,
+                      str(hits.get("fasta_files")))
+
+                # best_hmm_wins: must still return the leu hits. On a Pynteny old
+                # enough to lack the option the service reports a `warning` and
+                # carries on; on a new-enough Pynteny it is honored (no warning).
+                bhw = await call(
+                    session, "run_synteny_search",
+                    data=str(data),
+                    synteny_structure=SYNTENY_STRUC,
+                    hmm_dir=str(hmm_dir),
+                    hmm_meta=str(hmm_meta),
+                    reuse=True,
+                    best_hmm_wins=True,
+                    outdir=outdir,
+                )
+                check("best_hmm_wins search still returns 3 hits", bhw["n_hits"] == 3,
+                      str(bhw.get("n_hits")))
+                if bhw.get("warning"):
+                    print(f"  [NOTE] best_hmm_wins not honored by installed Pynteny: "
+                          f"{bhw['warning']}")
+                else:
+                    print("  [NOTE] best_hmm_wins honored by installed Pynteny.")
+
+    print("\nAll smoke-test checks passed.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

From 17c1a98c83296b01c6de74841131b61a2e9c06d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?=
 <semidan.robaina@gmail.com>
Date: Fri, 19 Jun 2026 14:02:30 +0100
Subject: [PATCH 2/2] Format mcp/ with black to satisfy CI quality check

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 mcp/examples/synteny_search_agent.py |  30 +++++--
 mcp/src/pynteny_mcp/service.py       |  12 ++-
 mcp/tests/smoke_test.py              | 117 ++++++++++++++++++++-------
 3 files changed, 119 insertions(+), 40 deletions(-)

diff --git a/mcp/examples/synteny_search_agent.py b/mcp/examples/synteny_search_agent.py
index 831e308..c3dfb9d 100644
--- a/mcp/examples/synteny_search_agent.py
+++ b/mcp/examples/synteny_search_agent.py
@@ -98,7 +98,9 @@ async def call(self, name: str, arguments: dict) -> str:
         result = await self.session.call_tool(
             name, arguments, read_timeout_seconds=TOOL_TIMEOUT
         )
-        parts = [getattr(b, "text", "") for b in result.content if getattr(b, "text", "")]
+        parts = [
+            getattr(b, "text", "") for b in result.content if getattr(b, "text", "")
+        ]
         text = "\n".join(parts).strip()
         if not text and getattr(result, "structuredContent", None):
             text = json.dumps(result.structuredContent)
@@ -108,7 +110,11 @@ async def call(self, name: str, arguments: dict) -> str:
 
     def anthropic_tools(self) -> list[dict]:
         return [
-            {"name": t.name, "description": t.description or "", "input_schema": t.inputSchema}
+            {
+                "name": t.name,
+                "description": t.description or "",
+                "input_schema": t.inputSchema,
+            }
             for t in self.tools
         ]
 
@@ -210,7 +216,10 @@ async def run_deepseek(toolbox: MCPToolbox, question: str) -> str:
                 {
                     "id": tc.id,
                     "type": "function",
-                    "function": {"name": tc.function.name, "arguments": tc.function.arguments},
+                    "function": {
+                        "name": tc.function.name,
+                        "arguments": tc.function.arguments,
+                    },
                 }
                 for tc in msg.tool_calls
             ]
@@ -249,7 +258,9 @@ async def main_async(provider: str, question: str) -> None:
             toolbox = MCPToolbox(session)
             await toolbox.load()
 
-            print(f"Connected to Pynteny MCP server: {len(toolbox.tools)} tools available")
+            print(
+                f"Connected to Pynteny MCP server: {len(toolbox.tools)} tools available"
+            )
             print(f"Provider: {provider}\n")
             print(f"Question:\n  {question}\n")
             print("--- agent trace ---")
@@ -259,7 +270,9 @@ async def main_async(provider: str, question: str) -> None:
                     answer = await run_claude(toolbox, question)
                 else:
                     answer = await run_deepseek(toolbox, question)
-            except Exception as exc:  # noqa: BLE001 — surface a clean message, not a stack trace
+            except (
+                Exception
+            ) as exc:  # noqa: BLE001 — surface a clean message, not a stack trace
                 print(f"\nLLM call failed ({type(exc).__name__}): {exc}")
                 return
 
@@ -270,11 +283,14 @@ async def main_async(provider: str, question: str) -> None:
 def main() -> None:
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument(
-        "--provider", choices=["claude", "deepseek"], default="claude",
+        "--provider",
+        choices=["claude", "deepseek"],
+        default="claude",
         help="Which LLM backend to use (default: claude).",
     )
     parser.add_argument(
-        "--question", default=DEFAULT_QUESTION,
+        "--question",
+        default=DEFAULT_QUESTION,
         help="Question to ask the agent (defaults to the leu-operon example).",
     )
     args = parser.parse_args()
diff --git a/mcp/src/pynteny_mcp/service.py b/mcp/src/pynteny_mcp/service.py
index 07cbdb4..3d2d37b 100644
--- a/mcp/src/pynteny_mcp/service.py
+++ b/mcp/src/pynteny_mcp/service.py
@@ -72,7 +72,9 @@ def pynteny_info() -> dict[str, Any]:
     from pynteny.utils import CommandArgs, ConfigParser
     from pynteny.subcommands import get_citation
 
-    citation = get_citation(CommandArgs(version=__version__, author=__author__), silent=True)
+    citation = get_citation(
+        CommandArgs(version=__version__, author=__author__), silent=True
+    )
 
     config = ConfigParser.get_default_config()
     db: dict[str, Any] = {}
@@ -183,7 +185,9 @@ def validate_structure(synteny_structure: str) -> dict[str, Any]:
     return result
 
 
-def parse_gene_symbols(synteny_structure: str, hmm_meta: Optional[str]) -> dict[str, Any]:
+def parse_gene_symbols(
+    synteny_structure: str, hmm_meta: Optional[str]
+) -> dict[str, Any]:
     """Translate a *gene-symbol* synteny structure into one based on HMM names,
     using a PGAP/PFAM metadata table."""
     from pynteny.api import Search
@@ -336,7 +340,9 @@ def run_search(
     # when the installed Pynteny supports it so this works on both.
     import inspect
 
-    best_hmm_wins_supported = "best_hmm_wins" in inspect.signature(Search.__init__).parameters
+    best_hmm_wins_supported = (
+        "best_hmm_wins" in inspect.signature(Search.__init__).parameters
+    )
     if best_hmm_wins_supported:
         kwargs["best_hmm_wins"] = best_hmm_wins
 
diff --git a/mcp/tests/smoke_test.py b/mcp/tests/smoke_test.py
index 8810aab..6ef9341 100644
--- a/mcp/tests/smoke_test.py
+++ b/mcp/tests/smoke_test.py
@@ -48,8 +48,12 @@
 
 async def call(session: ClientSession, name: str, **arguments):
     result = await session.call_tool(name, arguments, read_timeout_seconds=TIMEOUT)
-    assert not getattr(result, "isError", False), f"{name} returned an error: {result.content}"
-    text = "\n".join(getattr(b, "text", "") for b in result.content if getattr(b, "text", ""))
+    assert not getattr(
+        result, "isError", False
+    ), f"{name} returned an error: {result.content}"
+    text = "\n".join(
+        getattr(b, "text", "") for b in result.content if getattr(b, "text", "")
+    )
     return json.loads(text)
 
 
@@ -82,30 +86,61 @@ async def main() -> None:
             async with ClientSession(read, write) as session:
                 await session.initialize()
                 tools = (await session.list_tools()).tools
-                print(f"Server exposes {len(tools)} tools: {', '.join(t.name for t in tools)}\n")
+                print(
+                    f"Server exposes {len(tools)} tools: {', '.join(t.name for t in tools)}\n"
+                )
                 check("expected tool count", len(tools) == 6, f"got {len(tools)}")
 
                 info = await call(session, "get_pynteny_info")
-                check("pynteny version reported", bool(info.get("version")), str(info.get("version")))
+                check(
+                    "pynteny version reported",
+                    bool(info.get("version")),
+                    str(info.get("version")),
+                )
                 check("citation present", "Pynteny" in info.get("citation", ""), "")
 
-                good = await call(session, "validate_synteny_structure",
-                                  synteny_structure=SYNTENY_STRUC)
-                check("valid structure recognised", good["valid"] is True, str(good.get("valid")))
-                check("structure has 3 genes", good["n_genes"] == 3, str(good.get("n_genes")))
-                check("max distances parsed", good["max_distances"] == [0, 1],
-                      str(good.get("max_distances")))
-                check("strands parsed", good["strands"] == ["neg", "neg", "neg"],
-                      str(good.get("strands")))
+                good = await call(
+                    session,
+                    "validate_synteny_structure",
+                    synteny_structure=SYNTENY_STRUC,
+                )
+                check(
+                    "valid structure recognised",
+                    good["valid"] is True,
+                    str(good.get("valid")),
+                )
+                check(
+                    "structure has 3 genes",
+                    good["n_genes"] == 3,
+                    str(good.get("n_genes")),
+                )
+                check(
+                    "max distances parsed",
+                    good["max_distances"] == [0, 1],
+                    str(good.get("max_distances")),
+                )
+                check(
+                    "strands parsed",
+                    good["strands"] == ["neg", "neg", "neg"],
+                    str(good.get("strands")),
+                )
 
                 # Two adjacent genes with no distance token between them: the
                 # gene/distance counts don't line up, so this is malformed.
-                bad = await call(session, "validate_synteny_structure",
-                                 synteny_structure=">TIGR00171.1 >TIGR00170.1")
-                check("invalid structure rejected", bad["valid"] is False, str(bad.get("valid")))
+                bad = await call(
+                    session,
+                    "validate_synteny_structure",
+                    synteny_structure=">TIGR00171.1 >TIGR00170.1",
+                )
+                check(
+                    "invalid structure rejected",
+                    bad["valid"] is False,
+                    str(bad.get("valid")),
+                )
 
                 hits = await call(
-                    session, "run_synteny_search",
+                    session,
+                    "run_synteny_search",
                     data=str(data),
                     synteny_structure=SYNTENY_STRUC,
                     hmm_dir=str(hmm_dir),
@@ -113,22 +148,39 @@ async def main() -> None:
                     reuse=True,
                     outdir=outdir,
                 )
-                check("search returned 3 hits", hits["n_hits"] == 3, str(hits.get("n_hits")))
+                check(
+                    "search returned 3 hits",
+                    hits["n_hits"] == 3,
+                    str(hits.get("n_hits")),
+                )
                 labels = {h.get("full_label") for h in hits["hits"]}
-                check("hit labels match expected leu operon", labels == EXPECTED_LABELS,
-                      str(labels))
-                check("hits annotated with gene_symbol", "gene_symbol" in hits["columns"],
-                      str(hits["columns"]))
-                check("wrote synteny_matched.tsv", bool(hits.get("synteny_table")),
-                      str(hits.get("synteny_table")))
-                check("wrote per-gene FASTA files", len(hits.get("fasta_files", [])) >= 1,
-                      str(hits.get("fasta_files")))
+                check(
+                    "hit labels match expected leu operon",
+                    labels == EXPECTED_LABELS,
+                    str(labels),
+                )
+                check(
+                    "hits annotated with gene_symbol",
+                    "gene_symbol" in hits["columns"],
+                    str(hits["columns"]),
+                )
+                check(
+                    "wrote synteny_matched.tsv",
+                    bool(hits.get("synteny_table")),
+                    str(hits.get("synteny_table")),
+                )
+                check(
+                    "wrote per-gene FASTA files",
+                    len(hits.get("fasta_files", [])) >= 1,
+                    str(hits.get("fasta_files")),
+                )
 
                 # best_hmm_wins: must still return the leu hits. On a Pynteny old
                 # enough to lack the option the service reports a `warning` and
                 # carries on; on a new-enough Pynteny it is honored (no warning).
                 bhw = await call(
-                    session, "run_synteny_search",
+                    session,
+                    "run_synteny_search",
                     data=str(data),
                     synteny_structure=SYNTENY_STRUC,
                     hmm_dir=str(hmm_dir),
@@ -137,11 +189,16 @@ async def main() -> None:
                     best_hmm_wins=True,
                     outdir=outdir,
                 )
-                check("best_hmm_wins search still returns 3 hits", bhw["n_hits"] == 3,
-                      str(bhw.get("n_hits")))
+                check(
+                    "best_hmm_wins search still returns 3 hits",
+                    bhw["n_hits"] == 3,
+                    str(bhw.get("n_hits")),
+                )
                 if bhw.get("warning"):
-                    print(f"  [NOTE] best_hmm_wins not honored by installed Pynteny: "
-                          f"{bhw['warning']}")
+                    print(
+                        f"  [NOTE] best_hmm_wins not honored by installed Pynteny: "
+                        f"{bhw['warning']}"
+                    )
                 else:
                     print("  [NOTE] best_hmm_wins honored by installed Pynteny.")