From 6df62a7f4ecf1d0bcb3fcddc87fd1e2b61dc9c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?= Date: Fri, 19 Jun 2026 14:00:05 +0100 Subject: [PATCH 1/2] Add Pynteny MCP server (mcp/) for LLM agents Adds an MCP (Model Context Protocol) server under mcp/ that exposes the Pynteny API as 6 tools (info, validate/parse synteny structure, build, search, download) so MCP-compatible agents (Claude, DeepSeek, etc.) can drive Pynteny in natural language. Includes a no-API-key smoke test and a Claude+DeepSeek example agent. This lives in the repo but is intentionally kept out of the pynteny PyPI package (packaging includes only src/pynteny), so it does not affect the distribution. Co-Authored-By: Claude Opus 4.8 --- mcp/.env.example | 20 ++ mcp/.gitignore | 13 + mcp/README.md | 174 +++++++++++ mcp/examples/synteny_search_agent.py | 290 ++++++++++++++++++ mcp/requirements.txt | 17 ++ mcp/src/pynteny_mcp/__init__.py | 3 + mcp/src/pynteny_mcp/server.py | 249 ++++++++++++++++ mcp/src/pynteny_mcp/service.py | 430 +++++++++++++++++++++++++++ mcp/tests/smoke_test.py | 152 ++++++++++ 9 files changed, 1348 insertions(+) create mode 100644 mcp/.env.example create mode 100644 mcp/.gitignore create mode 100644 mcp/README.md create mode 100644 mcp/examples/synteny_search_agent.py create mode 100644 mcp/requirements.txt create mode 100644 mcp/src/pynteny_mcp/__init__.py create mode 100644 mcp/src/pynteny_mcp/server.py create mode 100644 mcp/src/pynteny_mcp/service.py create mode 100644 mcp/tests/smoke_test.py diff --git a/mcp/.env.example b/mcp/.env.example new file mode 100644 index 0000000..95a0c04 --- /dev/null +++ b/mcp/.env.example @@ -0,0 +1,20 @@ +# Copy this file to `.env` and fill in your keys. `.env` is gitignored. +# Used by examples/synteny_search_agent.py (the MCP server itself needs no keys). + +# --- Claude (Anthropic) --- +ANTHROPIC_API_KEY=sk-ant-... 
+ANTHROPIC_MODEL=claude-opus-4-8 + +# --- DeepSeek (OpenAI-compatible API) --- +DEEPSEEK_API_KEY=sk-... +DEEPSEEK_BASE_URL=https://api.deepseek.com +DEEPSEEK_MODEL=deepseek-v4-pro + +# --- Optional Pynteny defaults for the example agent --- +# These let the agent run searches without you re-specifying paths each time. +# A labelled peptide database (output of `build_peptide_database`) to search: +# PYNTENY_DATA=/abs/path/to/labelled_peptides.faa +# Directory of HMM files (e.g. the downloaded PGAP database): +# PYNTENY_HMM_DIR=/abs/path/to/data/hmms +# HMM metadata TSV (PGAP/PFAM), needed for gene-symbol searches: +# PYNTENY_HMM_META=/abs/path/to/data/hmms/hmm_PGAP.tsv diff --git a/mcp/.gitignore b/mcp/.gitignore new file mode 100644 index 0000000..2fd5649 --- /dev/null +++ b/mcp/.gitignore @@ -0,0 +1,13 @@ +# Secrets — never commit real API keys +.env +.env.local +*.env +!.env.example + +# Python build / cache +__pycache__/ +*.py[cod] +*.egg-info/ +build/ +dist/ +.venv/ diff --git a/mcp/README.md b/mcp/README.md new file mode 100644 index 0000000..e671696 --- /dev/null +++ b/mcp/README.md @@ -0,0 +1,174 @@ +# Pynteny MCP server + +A [Model Context Protocol](https://modelcontextprotocol.io) server that exposes +the [Pynteny](../) API as tools, so any MCP-compatible agent — Claude Desktop, +Claude Code, or a custom client driving Claude / DeepSeek / etc. — can run +synteny-aware HMM searches over genomic sequence data in natural language. + +It wraps `pynteny.api` (`Search` / `Build` / `Download`) and the synteny-structure +parsers, returning **compact JSON summaries** (the matched hits table and the +paths of the files Pynteny writes) instead of dumping large objects. 
+ +``` +mcp/ +├── pyproject.toml # installable package: `pynteny-mcp` +├── requirements.txt # server + example deps +├── .env.example # copy to .env and fill in keys (gitignored) +├── src/pynteny_mcp/ +│ ├── server.py # FastMCP server + tool definitions +│ └── service.py # API wrappers, stdout-safe logging, JSON summaries +├── examples/ +│ └── synteny_search_agent.py # LLM agent (Claude + DeepSeek) demo +└── tests/ + └── smoke_test.py # no-API-key end-to-end check against the test data +``` + +## What is a synteny structure? + +A synteny structure describes a target gene arrangement, e.g. + +``` +>leuD 0 >leuC 1 <leuA +``` + +where `>`/`<` gives the **strand** (sense / antisense), and the integers are +the **maximum number of (untargeted) genes** allowed between neighbours. Groups of +interchangeable HMMs for one gene are written `(HMM_A|HMM_B)`. + +## Tools + +| Tool | What it does | +|------|--------------| +| `get_pynteny_info` | Pynteny version, citation, and which HMM databases (PGAP/PFAM) are downloaded | +| `validate_synteny_structure` | Validate & decompose a structure (HMMs, strands, distances) — no I/O | +| `parse_gene_symbols` | Translate a gene-symbol structure into HMM names via a metadata table | +| `build_peptide_database` | Predict ORFs and label them from a nucleotide assembly / GenBank | +| `run_synteny_search` | **Core:** run the synteny-aware HMM search; return matched hits + output paths (incl. `best_hmm_wins` for paralog cross-hits) | +| `download_hmm_databases` | Download the PGAP and/or PFAM profile-HMM databases | + +## Prerequisites + +A Python ≥ 3.8 environment with **Pynteny** installed. 
Pynteny is now a +pure-Python pip package — HMMER and Prodigal come bundled via +[pyhmmer](https://github.com/althonos/pyhmmer) and +[pyrodigal](https://github.com/althonos/pyrodigal), so **no conda environment or +external binaries are required**: + +```bash +pip install pynteny # or: pip install git+https://github.com/Robaina/Pynteny.git +``` + +## Install + +Install the server (and example) dependencies into that same environment: + +```bash +cd mcp +pip install -r requirements.txt # mcp, pynteny, anthropic, openai, python-dotenv +# optional — register the `pynteny-mcp` console script: +pip install -e . +``` + +## Configure + +Only the example agent needs API keys; the server itself does not. + +```bash +cp .env.example .env # .env is gitignored +``` + +Edit `.env`: + +```ini +ANTHROPIC_API_KEY=sk-ant-... +DEEPSEEK_API_KEY=sk-... +# optional defaults so the agent need not repeat paths: +# PYNTENY_DATA=/abs/path/to/labelled_peptides.faa +# PYNTENY_HMM_DIR=/abs/path/to/data/hmms +# PYNTENY_HMM_META=/abs/path/to/data/hmms/hmm_PGAP.tsv +``` + +## Verify (no API keys needed) + +Runs the server against Pynteny's committed test genome and asserts the known +*leu*-operon synteny hits: + +```bash +python tests/smoke_test.py +# → "All smoke-test checks passed." +``` + +## Run the example agent + +An LLM decides how to validate the structure and run the search to answer a +question. By default it searches Pynteny's test genome +(`../tests/test_data/MG1655.fasta`), so it works with no extra data; set +`PYNTENY_DATA` / `PYNTENY_HMM_DIR` / `PYNTENY_HMM_META` to search your own. + +```bash +python examples/synteny_search_agent.py --provider claude +python examples/synteny_search_agent.py --provider deepseek +python examples/synteny_search_agent.py --provider claude \ + --question "Is leuD-leuC-leuA syntenic in this genome, and on which strand?" 
+``` + +The example launches the MCP server itself (as a stdio subprocess using the same +Python interpreter), connects as an MCP client, converts the MCP tool schemas to +each provider's tool format, and runs a manual tool-use loop. + +> **Models.** Claude defaults to `claude-opus-4-8` with adaptive thinking; +> DeepSeek defaults to `deepseek-v4-pro`. Override via `ANTHROPIC_MODEL` / +> `DEEPSEEK_MODEL` in `.env`. + +## A typical end-to-end workflow + +For real data (not the bundled test genome) the agent (or you) would: + +1. `download_hmm_databases(outdir="data/hmms")` — once, to fetch PGAP HMMs + metadata. +2. `build_peptide_database(data="assembly.fa", outfile="labelled_peptides.faa")` — predict & label ORFs. +3. `validate_synteny_structure(">leuD 0 >leuC 1 leuD 0 >leuC 1 None: + self.tools = (await self.session.list_tools()).tools + + async def call(self, name: str, arguments: dict) -> str: + result = await self.session.call_tool( + name, arguments, read_timeout_seconds=TOOL_TIMEOUT + ) + parts = [getattr(b, "text", "") for b in result.content if getattr(b, "text", "")] + text = "\n".join(parts).strip() + if not text and getattr(result, "structuredContent", None): + text = json.dumps(result.structuredContent) + if getattr(result, "isError", False): + return f"ERROR: {text or '(unknown tool error)'}" + return text or "(no content)" + + def anthropic_tools(self) -> list[dict]: + return [ + {"name": t.name, "description": t.description or "", "input_schema": t.inputSchema} + for t in self.tools + ] + + def openai_tools(self) -> list[dict]: + return [ + { + "type": "function", + "function": { + "name": t.name, + "description": t.description or "", + "parameters": t.inputSchema, + }, + } + for t in self.tools + ] + + +def _log_tool_call(name: str, arguments: dict) -> None: + print(f" → tool: {name}({json.dumps(arguments, ensure_ascii=False)})") + + +# --------------------------------------------------------------------------- # +# Claude backend (official 
# --------------------------------------------------------------------------- #
# Claude backend (official anthropic SDK, manual agentic loop)                #
# --------------------------------------------------------------------------- #
async def run_claude(toolbox: MCPToolbox, question: str) -> str:
    """Answer *question* with Claude, letting it call the Pynteny MCP tools.

    Runs a manual tool-use loop for at most ``MAX_TURNS`` turns: every
    ``tool_use`` block in a reply is executed through ``toolbox.call`` and the
    outputs are sent back as ``tool_result`` blocks in the next user message.

    Args:
        toolbox: connected MCP toolbox that supplies the tool schemas and
            executes the tool calls.
        question: the user's question.

    Returns:
        The concatenated text of the first reply whose stop reason is not
        ``tool_use``, or a "(stopped: ...)" notice when the turn budget runs out.
    """
    from anthropic import AsyncAnthropic

    model = os.environ.get("ANTHROPIC_MODEL", "claude-opus-4-8")
    effort = os.environ.get("ANTHROPIC_EFFORT", "medium")
    client = AsyncAnthropic()  # reads ANTHROPIC_API_KEY from the environment
    tools = toolbox.anthropic_tools()
    messages: list[dict] = [{"role": "user", "content": question}]

    for _ in range(MAX_TURNS):
        resp = await client.messages.create(
            model=model,
            max_tokens=16000,
            system=SYSTEM_PROMPT,
            tools=tools,
            # Adaptive thinking lets Claude decide how much to reason between
            # tool calls; effort trades thoroughness against token cost.
            thinking={"type": "adaptive", "display": "summarized"},
            output_config={"effort": effort},
            messages=messages,
        )

        # Echo a short trace of what the model thought / said this turn.
        for block in resp.content:
            if block.type == "thinking" and getattr(block, "thinking", ""):
                print(f" [thinking] {block.thinking.strip()[:300]}")
            elif block.type == "text" and block.text.strip():
                print(f" [claude] {block.text.strip()}")

        if resp.stop_reason != "tool_use":
            return "".join(b.text for b in resp.content if b.type == "text").strip()

        # The assistant turn is replayed verbatim so the tool_use ids line up
        # with the tool_result blocks that follow.
        messages.append({"role": "assistant", "content": resp.content})
        tool_results = []
        for block in resp.content:
            if block.type == "tool_use":
                _log_tool_call(block.name, block.input)
                output = await toolbox.call(block.name, block.input)
                tool_results.append(
                    {"type": "tool_result", "tool_use_id": block.id, "content": output}
                )
        messages.append({"role": "user", "content": tool_results})

    return "(stopped: reached the maximum number of tool-use turns)"


# --------------------------------------------------------------------------- #
# DeepSeek backend (OpenAI-compatible API, manual function-calling loop)      #
# --------------------------------------------------------------------------- #
def _decode_tool_arguments(raw):
    """Decode the JSON argument string of one OpenAI-style tool call.

    Args:
        raw: the ``function.arguments`` string (may be empty or ``None``).

    Returns:
        ``(arguments, problem)``: ``problem`` is ``None`` on success, otherwise
        a short description of why the arguments are unusable (and
        ``arguments`` is then an empty dict).
    """
    try:
        arguments = json.loads(raw or "{}")
    except json.JSONDecodeError as exc:
        return {}, f"tool arguments were not valid JSON ({exc.msg})"
    if not isinstance(arguments, dict):
        return {}, "tool arguments must be a JSON object"
    return arguments, None


async def run_deepseek(toolbox: MCPToolbox, question: str) -> str:
    """Answer *question* with DeepSeek through its OpenAI-compatible API.

    Runs a manual function-calling loop for at most ``MAX_TURNS`` turns. Each
    requested tool call is executed through ``toolbox.call`` and answered with
    a ``tool`` message carrying the matching ``tool_call_id``.

    Args:
        toolbox: connected MCP toolbox that supplies the tool schemas and
            executes the tool calls.
        question: the user's question.

    Returns:
        The text of the first reply that requests no tool calls, or a
        "(stopped: ...)" notice when the turn budget runs out.

    Raises:
        KeyError: if ``DEEPSEEK_API_KEY`` is not set in the environment.
    """
    from openai import AsyncOpenAI

    model = os.environ.get("DEEPSEEK_MODEL", "deepseek-v4-pro")
    client = AsyncOpenAI(
        api_key=os.environ["DEEPSEEK_API_KEY"],
        base_url=os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com"),
    )
    tools = toolbox.openai_tools()
    messages: list[dict] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]

    for _ in range(MAX_TURNS):
        resp = await client.chat.completions.create(
            model=model, messages=messages, tools=tools
        )
        msg = resp.choices[0].message

        if msg.content and msg.content.strip():
            print(f" [deepseek] {msg.content.strip()}")

        assistant: dict = {"role": "assistant", "content": msg.content or ""}
        if msg.tool_calls:
            assistant["tool_calls"] = [
                {
                    "id": tc.id,
                    "type": "function",
                    "function": {"name": tc.function.name, "arguments": tc.function.arguments},
                }
                for tc in msg.tool_calls
            ]
        messages.append(assistant)

        if not msg.tool_calls:
            return (msg.content or "").strip()

        for tc in msg.tool_calls:
            arguments, problem = _decode_tool_arguments(tc.function.arguments)
            if problem is None:
                _log_tool_call(tc.function.name, arguments)
                output = await toolbox.call(tc.function.name, arguments)
            else:
                # Do not run the tool with silently emptied arguments: report
                # the problem so the model can re-issue a well-formed call.
                # Every tool_call_id still gets exactly one tool message.
                print(f" → tool: {tc.function.name} skipped ({problem})")
                output = f"ERROR: {problem}"
            messages.append({"role": "tool", "tool_call_id": tc.id, "content": output})

    return "(stopped: reached the maximum number of tool-use turns)"


# --------------------------------------------------------------------------- #
# Driver                                                                      #
# --------------------------------------------------------------------------- #
async def main_async(provider: str, question: str) -> None:
    """Start the MCP server, connect to it and run one agent session.

    Args:
        provider: ``"claude"`` selects :func:`run_claude`; any other value
            selects :func:`run_deepseek`.
        question: the question handed to the agent.
    """
    # Launch the MCP server as a subprocess, using the *same* interpreter (so it
    # shares this environment's `pynteny` + `mcp`), with src/ on PYTHONPATH.
    # src/ is prepended rather than assigned so that an inherited PYTHONPATH
    # (e.g. one that makes `pynteny` itself importable) is not discarded.
    inherited_pythonpath = os.environ.get("PYTHONPATH")
    if inherited_pythonpath:
        pythonpath = os.pathsep.join([str(SRC_DIR), inherited_pythonpath])
    else:
        pythonpath = str(SRC_DIR)
    server_params = StdioServerParameters(
        command=sys.executable,
        args=["-m", "pynteny_mcp.server"],
        env={**os.environ, "PYTHONPATH": pythonpath},
    )

    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            toolbox = MCPToolbox(session)
            await toolbox.load()

            print(f"Connected to Pynteny MCP server: {len(toolbox.tools)} tools available")
            print(f"Provider: {provider}\n")
            print(f"Question:\n {question}\n")
            print("--- agent trace ---")

            try:
                if provider == "claude":
                    answer = await run_claude(toolbox, question)
                else:
                    answer = await run_deepseek(toolbox, question)
            except Exception as exc:  # noqa: BLE001 — surface a clean message, not a stack trace
                print(f"\nLLM call failed ({type(exc).__name__}): {exc}")
                return

            print("\n--- final answer ---")
            print(answer)


def main() -> None:
    """Parse the command line, check the provider's API key and run the agent."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--provider", choices=["claude", "deepseek"], default="claude",
        help="Which LLM backend to use (default: claude).",
    )
    parser.add_argument(
        "--question", default=DEFAULT_QUESTION,
        help="Question to ask the agent (defaults to the leu-operon example).",
    )
    args = parser.parse_args()

    # Fail before spawning the server when the selected provider has no key.
    key_var = "ANTHROPIC_API_KEY" if args.provider == "claude" else "DEEPSEEK_API_KEY"
    if not os.environ.get(key_var):
        sys.exit(f"{key_var} is not set. Add it to {ENV_PATH} (see .env.example).")

    asyncio.run(main_async(args.provider, args.question))


if __name__ == "__main__":
    main()
+# Install into the same environment that has `pynteny` (a pure-Python pip +# package — no conda or external binaries required): +# +# pip install pynteny # if not already installed +# pip install -r requirements.txt +# +# The server itself only needs `mcp` and `pynteny`; anthropic/openai/ +# python-dotenv are only used by examples/synteny_search_agent.py. + +mcp>=1.2.0 +pynteny>=1.3.0 + +# --- example agent only --- +anthropic>=0.40 +openai>=1.40 +python-dotenv>=1.0 diff --git a/mcp/src/pynteny_mcp/__init__.py b/mcp/src/pynteny_mcp/__init__.py new file mode 100644 index 0000000..7fa0266 --- /dev/null +++ b/mcp/src/pynteny_mcp/__init__.py @@ -0,0 +1,3 @@ +"""Pynteny MCP server — expose the Pynteny API to MCP-compatible LLM agents.""" + +__version__ = "0.1.0" diff --git a/mcp/src/pynteny_mcp/server.py b/mcp/src/pynteny_mcp/server.py new file mode 100644 index 0000000..9080da9 --- /dev/null +++ b/mcp/src/pynteny_mcp/server.py @@ -0,0 +1,249 @@ +""" +Pynteny MCP server. + +Exposes the Pynteny API as Model Context Protocol tools so that any +MCP-compatible agent (Claude Desktop, Claude Code, or a custom client driving +Claude / DeepSeek / etc.) can run synteny-aware HMM searches over genomic +sequence data in natural language. + +Pynteny (https://github.com/Robaina/Pynteny) finds synteny blocks in +(prokaryotic) sequence data: it searches a labelled peptide database with +profile HMMs (via HMMER / pyhmmer) and keeps only the hits whose genomic +arrangement — order, strand and gene spacing — matches a user-supplied synteny +structure such as ``>leuD 0 >leuC 1 leuD 0 >leuC 1 "/"<" is the strand (sense / antisense), and the integers are the +maximum number of (untargeted) genes allowed between neighbours. Groups of +interchangeable HMMs for one gene are written "(HMM_A|HMM_B)". + +Typical workflow: + 1. `get_pynteny_info` — confirm the install and see which HMM databases + (PGAP / PFAM) are already downloaded. + 2. 
`validate_synteny_structure` — cheaply check/parse a structure before a run. + 3. (once) `download_hmm_databases` — fetch the PGAP HMMs + metadata, unless you + already have an `hmm_dir`. + 4. `build_peptide_database` — turn a nucleotide assembly / GenBank file into the + labelled peptide FASTA that search consumes (skip if you already have one). + 5. `run_synteny_search` — the core tool: returns the matched hits table and the + paths of the FASTA / TSV files Pynteny writes. + +Use `parse_gene_symbols` to translate a gene-symbol structure into HMM names +against a metadata file. Searching by gene symbol directly is done by passing +gene_ids=True to `run_synteny_search`. +""" + +mcp = FastMCP("pynteny", instructions=INSTRUCTIONS) + + +@mcp.tool() +def get_pynteny_info() -> dict: + """Return the installed Pynteny version, author, citation, and which HMM + databases (PGAP / PFAM) are currently downloaded and configured. A good + first call to confirm the server is wired up and to decide whether you need + to download HMMs before searching.""" + return service.pynteny_info() + + +@mcp.tool() +def validate_synteny_structure(synteny_structure: str) -> dict: + """Validate and decompose a synteny structure string without running any + search or touching any file. Returns whether it is well-formed plus its + parsed parts: the HMM groups, the flat list of HMM names, the strand of each + gene, and the max-distance constraints between neighbours. Use this to catch + formatting mistakes before an expensive `run_synteny_search`. + + Args: + synteny_structure: e.g. ">leuD 0 >leuC 1 "/"<" give the strand, integers are the maximum + number of genes allowed between neighbours, and "(A|B)" groups + interchangeable HMMs for one gene. + """ + return service.validate_structure(synteny_structure) + + +@mcp.tool() +def parse_gene_symbols(synteny_structure: str, hmm_meta: Optional[str] = None) -> dict: + """Translate a synteny structure written with *gene symbols* (e.g. 
"leuD") + into one written with the corresponding *HMM names* (e.g. "TIGR00171.1"), + using a PGAP/PFAM metadata table. + + Args: + synteny_structure: gene-symbol structure, e.g. ">leuD 0 >leuC 1 dict: + """Build a labelled peptide database from a nucleotide assembly (or a GenBank + file / directory). ORFs are predicted (Prodigal/pyrodigal) and each peptide + is labelled with its contig and gene-location info so that synteny search can + reason about gene order and strand. Returns the output file path and the + number of peptides written. + + Args: + data: path to the input assembly FASTA, GenBank file, or a directory of + such files. + outfile: path for the labelled peptide FASTA. Defaults to a name derived + from `data`. + prepend: when `data` is a directory, prepend each file's name to its + sequence IDs (keeps IDs unique across genomes). + processes: max worker processes (defaults to all CPUs minus one). + tempdir: directory for temporary files. + logfile: path to a log file (logs otherwise go to stderr/devnull). + """ + return service.build_database( + data, + outfile=outfile, + prepend=prepend, + processes=processes, + tempdir=tempdir, + logfile=logfile, + ) + + +@mcp.tool() +def run_synteny_search( + data: str, + synteny_structure: str, + hmm_dir: Optional[str] = None, + hmm_meta: Optional[str] = None, + gene_ids: bool = False, + unordered: bool = False, + best_hmm_wins: bool = False, + reuse: bool = False, + outdir: Optional[str] = None, + prefix: str = "", + hmmsearch_args: Optional[str] = None, + processes: Optional[int] = None, + logfile: Optional[str] = None, + max_hits: int = 200, +) -> dict: + """Search a labelled peptide database for genes arranged in the given synteny + structure. This is the core Pynteny operation. It runs HMMER for each HMM, + then keeps only hits whose genomic context (order, strand, spacing) matches + the structure. 
Returns the matched hits as records, plus the paths of the + per-gene FASTA files and the `synteny_matched.tsv` table Pynteny writes. + + Args: + data: path to the labelled peptide database (output of + `build_peptide_database`). + synteny_structure: target arrangement, e.g. ">leuD 0 >leuC 1 dict: + """Download profile-HMM databases (PGAP from NCBI and/or PFAM-A) and register + them in Pynteny's config so later searches can find them automatically. This + downloads large files over the network and can take a while. + + Args: + outdir: directory to download the HMM databases into. + pgap: download the PGAP database (HMMs + metadata). Default True. + pfam: download the PFAM-A database. Default False. + unpack: unpack the archives now (otherwise Pynteny unpacks per session). + force: re-download even if already present. + logfile: path to a log file. + """ + return service.download_databases( + outdir, + pgap=pgap, + pfam=pfam, + unpack=unpack, + force=force, + logfile=logfile, + ) + + +def main() -> None: + """Console-script entry point: run the server over stdio.""" + import logging + import os + + # Quiet the per-request INFO chatter from the MCP runtime unless debugging. + if not os.environ.get("PYNTENY_MCP_DEBUG"): + logging.getLogger("mcp").setLevel(logging.WARNING) + mcp.run() + + +if __name__ == "__main__": + main() diff --git a/mcp/src/pynteny_mcp/service.py b/mcp/src/pynteny_mcp/service.py new file mode 100644 index 0000000..07cbdb4 --- /dev/null +++ b/mcp/src/pynteny_mcp/service.py @@ -0,0 +1,430 @@ +""" +Service layer for the Pynteny MCP server. 
"""
Service layer for the Pynteny MCP server.

Wraps the :mod:`pynteny` public API (``pynteny.api.Search / Build / Download``
and the synteny-structure parsers) with helpers that:

* keep the MCP **stdio channel clean** — Pynteny logs to ``stdout`` and several
  helpers print there, which would corrupt the JSON-RPC stream, so every call
  into Pynteny is wrapped in :func:`_pynteny_run` which redirects ``stdout`` to
  ``stderr`` and isolates Pynteny's root-logger handlers, and
* turn the rich return values (``SyntenyHits`` / pandas ``DataFrame`` / written
  output files) into compact, JSON-serialisable summaries.

Everything here is transport-agnostic; ``server.py`` only adds the MCP tool
definitions on top.
"""

from __future__ import annotations

import contextlib
import logging
import os
import sys
from pathlib import Path
from typing import Any, Optional


def _log(msg: str) -> None:
    """Write one diagnostic line to stderr."""
    # NEVER write to stdout: it is the MCP stdio channel. Logs go to stderr.
    print(f"[pynteny-mcp] {msg}", file=sys.stderr, flush=True)


@contextlib.contextmanager
def _pynteny_run():
    """
    Run a block of Pynteny code without polluting the MCP stdio channel.

    Pynteny's ``init_logger`` calls ``logging.basicConfig`` with a
    ``StreamHandler(sys.stdout)`` and several code paths ``print`` to stdout. The
    MCP stdio transport uses stdout for protocol traffic, so any of that would
    break the connection. We:

    * redirect ``sys.stdout`` to ``sys.stderr`` for the duration of the call
      (so the StreamHandler Pynteny installs binds to stderr, and stray prints
      go to stderr too), and
    * detach the root logger's existing handlers first and restore them after,
      so repeated calls don't accumulate handlers or reuse ones Pynteny closed
      via ``logging.shutdown()``.
    """
    root = logging.getLogger()
    saved_handlers = root.handlers[:]
    for h in saved_handlers:
        root.removeHandler(h)
    try:
        with contextlib.redirect_stdout(sys.stderr):
            yield
    finally:
        # Drop whatever handlers Pynteny installed (it may have closed them in
        # logging.shutdown()), then restore the ones we started with.
        for h in root.handlers[:]:
            root.removeHandler(h)
        for h in saved_handlers:
            root.addHandler(h)


# --------------------------------------------------------------------------- #
# Config / metadata                                                           #
# --------------------------------------------------------------------------- #
_DOWNLOAD_STATE_FIELDS = (
    "PGAP_data_downloaded",
    "PFAM_data_downloaded",
    "PGAP_database",
    "PGAP_meta_file",
    "PFAM_database",
    "PFAM_meta_file",
)


def _read_config_fields(config: Any, fields: tuple[str, ...]) -> dict[str, Any]:
    """Read *fields* from a Pynteny config object, mapping unreadable ones to None."""
    values: dict[str, Any] = {}
    for field in fields:
        try:
            values[field] = config.get_field(field)
        except Exception:  # noqa: BLE001 — a missing field is reported, not fatal
            values[field] = None
    return values


def _first_configured(fields: tuple[str, ...]) -> Optional[str]:
    """Return the first truthy value among *fields* in Pynteny's default config."""
    from pynteny.utils import ConfigParser

    config = ConfigParser.get_default_config()
    for field in fields:
        try:
            value = config.get_field(field)
        except Exception:  # noqa: BLE001 — treat an unreadable field as unset
            value = None
        if value:
            return value
    return None


def pynteny_info() -> dict[str, Any]:
    """Version, author, citation and which HMM databases are configured.

    Returns:
        A dict with ``version``, ``author``, ``citation``, ``config_file`` and
        ``hmm_databases`` (config fields; ``None`` where a field is unreadable).
    """
    from pynteny.api import __author__, __version__
    from pynteny.subcommands import get_citation
    from pynteny.utils import CommandArgs, ConfigParser

    citation = get_citation(CommandArgs(version=__version__, author=__author__), silent=True)

    config = ConfigParser.get_default_config()
    return {
        "version": __version__,
        "author": __author__,
        "citation": citation,
        "config_file": str(config.get_config_path()),
        "hmm_databases": _read_config_fields(
            config, _DOWNLOAD_STATE_FIELDS + ("database_dir",)
        ),
    }


def _resolve_hmm_meta(hmm_meta: Optional[str]) -> Optional[str]:
    """Fall back to the PGAP (then PFAM) metadata file recorded in Pynteny's config."""
    if hmm_meta:
        return hmm_meta
    return _first_configured(("PGAP_meta_file", "PFAM_meta_file"))


def _resolve_hmm_dir(hmm_dir: Optional[str]) -> Optional[str]:
    """Fall back to the PGAP (then PFAM) HMM directory recorded in Pynteny's config."""
    if hmm_dir:
        return hmm_dir
    return _first_configured(("PGAP_database", "PFAM_database"))


# --------------------------------------------------------------------------- #
# Synteny-structure parsing (no I/O)                                          #
# --------------------------------------------------------------------------- #
def validate_structure(synteny_structure: str) -> dict[str, Any]:
    """Reformat, validate and decompose a synteny structure string.

    Pure string work — no HMMs, files or searches involved. Several Pynteny
    parser helpers call ``sys.exit`` on malformed input, so we trap
    ``SystemExit`` and report ``valid=False`` instead of killing the server.

    Args:
        synteny_structure: the structure string to check.

    Returns:
        ``input`` and ``reformatted`` always; on success also ``valid``,
        ``n_genes``, ``hmm_groups``, ``hmm_names``, ``strands``,
        ``max_distances`` and ``contains_hmm_groups``; on failure
        ``valid=False`` plus ``error``.
    """
    import pynteny.parsers.syntenyparser as sp

    reformatted = sp.reformat_synteny_structure(synteny_structure)
    result: dict[str, Any] = {
        "input": synteny_structure,
        "reformatted": reformatted,
    }
    try:
        with _pynteny_run():
            valid = sp.is_valid_structure(reformatted)
            hmm_groups = sp.get_HMM_groups_in_structure(reformatted)
            hmm_names = sp.get_all_HMMs_in_structure(reformatted)
            strands = sp.get_strands_in_structure(reformatted)
            distances = sp.get_maximum_distances_in_structure(reformatted)
            # Kept inside the guard with the other parser calls so a parser
            # failure here is reported the same way instead of escaping.
            has_groups = sp.contains_HMM_groups(reformatted)
    except SystemExit:
        result["valid"] = False
        result["error"] = (
            "Could not parse the structure. Expected something like "
            "'>hmm_a 0 >hmm_b 3 <hmm_c' (strand '>'/'<', integers = max genes "
            "between neighbours)."
        )
        return result
    except Exception as exc:  # noqa: BLE001
        result["valid"] = False
        result["error"] = f"{type(exc).__name__}: {exc}"
        return result

    result.update(
        {
            "valid": bool(valid),
            "n_genes": len(hmm_groups),
            "hmm_groups": hmm_groups,
            "hmm_names": hmm_names,
            "strands": strands,
            "max_distances": distances,
            "contains_hmm_groups": has_groups,
        }
    )
    return result


def parse_gene_symbols(synteny_structure: str, hmm_meta: Optional[str]) -> dict[str, Any]:
    """Translate a *gene-symbol* synteny structure into one based on HMM names,
    using a PGAP/PFAM metadata table.

    Args:
        synteny_structure: structure written with gene symbols.
        hmm_meta: path to the metadata table; when falsy, the file recorded in
            Pynteny's config is used.

    Returns:
        ``input``, ``hmm_meta`` and ``translated_structure`` on success;
        otherwise a dict with an ``error`` message.
    """
    resolved_meta = _resolve_hmm_meta(hmm_meta)
    if not resolved_meta:
        return {
            "error": (
                "No HMM metadata file available. Provide `hmm_meta`, or download "
                "the PGAP database first with `download_hmm_databases`."
            )
        }
    if not Path(resolved_meta).exists():
        return {"error": f"HMM metadata file not found: {resolved_meta}"}

    from pynteny.api import Search

    try:
        with _pynteny_run():
            # Constructed inside the guard so that anything the constructor
            # prints or logs also stays off the MCP stdout channel.
            search = Search(data=".", synteny_struc=synteny_structure, hmm_meta=resolved_meta)
            translated = search.parse_genes(synteny_structure)
    except SystemExit:
        return {
            "input": synteny_structure,
            "hmm_meta": resolved_meta,
            "error": (
                "One or more gene symbols did not match an HMM in the metadata "
                "table. Check the gene symbols and the metadata file."
            ),
        }
    return {
        "input": synteny_structure,
        "hmm_meta": resolved_meta,
        "translated_structure": translated,
    }


# --------------------------------------------------------------------------- #
# Build                                                                       #
# --------------------------------------------------------------------------- #
def _count_fasta_records(path: Path) -> int:
    """Count the ``>`` header lines in a FASTA file; -1 if it cannot be read."""
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as fh:
            return sum(1 for line in fh if line.startswith(">"))
    except OSError:
        return -1


def build_database(
    data: str,
    *,
    outfile: Optional[str],
    prepend: bool,
    processes: Optional[int],
    tempdir: Optional[str],
    logfile: Optional[str],
) -> dict[str, Any]:
    """Translate a nucleotide assembly (or GenBank file/dir) into a labelled
    peptide database that synteny search can run on.

    Args:
        data: path to the input assembly, GenBank file or directory.
        outfile: path for the labelled peptide FASTA (``None`` lets Pynteny
            choose).
        prepend: forwarded to ``Build`` unchanged.
        processes: forwarded to ``Build`` unchanged.
        tempdir: forwarded to ``Build`` unchanged.
        logfile: forwarded to ``Build`` unchanged.

    Returns:
        ``input`` plus ``output_file`` / ``output_size_bytes`` / ``n_peptides``
        when the output exists, a ``warning`` when it cannot be located, or an
        ``error`` when the input is missing or Pynteny aborted.
    """
    data_path = Path(data)
    if not data_path.exists():
        return {"error": f"Input data not found: {data}"}

    # Imported only after the cheap path check so bad input fails fast.
    from pynteny.api import Build

    try:
        with _pynteny_run():
            build = Build(
                data=data_path,
                prepend=prepend,
                outfile=outfile,
                logfile=logfile,
                processes=processes,
                tempdir=tempdir,
            )
            build.run()
    except SystemExit as exc:
        # Pynteny reports fatal problems through sys.exit(); uncaught, that
        # would terminate the MCP server instead of failing this one call.
        return {
            "input": str(data_path),
            "error": f"Pynteny build aborted (exit code {exc.code}); check the log output.",
        }

    out = Path(build._args.outfile) if build._args.outfile else None
    result: dict[str, Any] = {"input": str(data_path)}
    if out is not None and out.exists():
        result["output_file"] = str(out)
        result["output_size_bytes"] = out.stat().st_size
        result["n_peptides"] = _count_fasta_records(out)
    else:
        result["output_file"] = str(out) if out else None
        result["warning"] = (
            "Build completed but the output file could not be located; "
            "check the logfile."
        )
    return result


# --------------------------------------------------------------------------- #
# Search                                                                      #
# --------------------------------------------------------------------------- #
def _json_records(df: Any) -> list[dict[str, Any]]:
    """Convert a hits DataFrame into JSON-safe records (missing values → None)."""
    # Cast to object first: on a float column ``where(..., None)`` is coerced
    # straight back to NaN, and NaN is not valid JSON.
    return df.astype(object).where(df.notna(), None).to_dict(orient="records")


def run_search(
    data: str,
    synteny_structure: str,
    *,
    gene_ids: bool,
    unordered: bool,
    best_hmm_wins: bool,
    reuse: bool,
    hmm_dir: Optional[str],
    hmm_meta: Optional[str],
    outdir: Optional[str],
    prefix: str,
    hmmsearch_args: Optional[str],
    processes: Optional[int],
    logfile: Optional[str],
    max_hits: int = 200,
) -> dict[str, Any]:
    """Run a synteny-aware HMM search over a labelled peptide database and return
    the matched hits plus the paths of the files Pynteny wrote.

    Args:
        data: path to the labelled peptide database.
        synteny_structure: target arrangement to search for.
        gene_ids, unordered, reuse, outdir, prefix, hmmsearch_args, processes,
            logfile: forwarded to ``pynteny.api.Search`` unchanged.
        best_hmm_wins: forwarded only when the installed ``Search`` accepts it;
            otherwise ignored and reported through ``warning``.
        hmm_dir: HMM directory; when falsy, the one in Pynteny's config is used.
        hmm_meta: HMM metadata table; when falsy, the one in Pynteny's config
            is used. If it exists, hits are annotated with it.
        max_hits: maximum number of hit records returned (negative values are
            treated as 0); ``n_hits`` always reports the full count.

    Returns:
        A summary dict (``hits``, ``n_hits``, ``columns``, ``hits_truncated``,
        output paths, ...), or a dict with an ``error`` message.
    """
    data_path = Path(data)
    if not data_path.exists():
        return {"error": f"Sequence database not found: {data}"}

    resolved_dir = _resolve_hmm_dir(hmm_dir)
    if resolved_dir is None:
        return {
            "error": (
                "No HMM directory available. Provide `hmm_dir`, or download the "
                "PGAP database first with `download_hmm_databases` (the search "
                "would otherwise trigger a large download)."
            )
        }
    if not Path(resolved_dir).exists():
        return {"error": f"HMM directory not found: {resolved_dir}"}
    resolved_meta = _resolve_hmm_meta(hmm_meta)

    # Imported only after the cheap checks so bad input fails fast.
    import inspect

    from pynteny.api import Search

    kwargs = dict(
        data=data_path,
        synteny_struc=synteny_structure,
        gene_ids=gene_ids,
        unordered=unordered,
        reuse=reuse,
        hmm_dir=resolved_dir,
        hmm_meta=resolved_meta,
        outdir=outdir,
        prefix=prefix,
        hmmsearch_args=hmmsearch_args,
        logfile=logfile,
        processes=processes,
    )
    # `best_hmm_wins` was added after the published 1.2.0 release; pass it only
    # when the installed Pynteny supports it so this works on both.
    best_hmm_wins_supported = "best_hmm_wins" in inspect.signature(Search.__init__).parameters
    if best_hmm_wins_supported:
        kwargs["best_hmm_wins"] = best_hmm_wins

    try:
        with _pynteny_run():
            search = Search(**kwargs)
            synteny_hits = search.run()
            if resolved_meta and Path(resolved_meta).exists():
                annotated = synteny_hits.add_HMM_meta_info_to_hits(resolved_meta)
                # Depending on the Pynteny version this returns either a SyntenyHits
                # or the underlying DataFrame directly.
                df = annotated.hits if hasattr(annotated, "hits") else annotated
            else:
                df = synteny_hits.hits
    except SystemExit as exc:
        # Pynteny reports fatal problems (e.g. unparseable structure, missing
        # HMMs) through sys.exit(); uncaught, that would kill the MCP server.
        return {
            "synteny_structure": synteny_structure,
            "data": str(data_path),
            "error": f"Pynteny search aborted (exit code {exc.code}); check the log output.",
        }

    out_dir = Path(search._args.outdir)
    records = _json_records(df)
    fasta_files = sorted(str(p) for p in out_dir.glob(f"{prefix}*_hits.fasta"))
    synteny_table = out_dir / f"{prefix}synteny_matched.tsv"
    # A negative limit would otherwise slice from the wrong end of the list.
    limit = max(0, max_hits)

    result: dict[str, Any] = {
        "synteny_structure": search._args.synteny_struc,
        "data": str(data_path),
        "n_hits": int(len(df)),
        "columns": list(df.columns),
        "hits": records[:limit],
        "hits_truncated": len(records) > limit,
        "output_dir": str(out_dir),
        "synteny_table": str(synteny_table) if synteny_table.exists() else None,
        "fasta_files": fasta_files,
    }
    if best_hmm_wins and not best_hmm_wins_supported:
        result["warning"] = (
            "best_hmm_wins was requested but the installed Pynteny does not "
            "support it; the option was ignored."
        )
    return result


# --------------------------------------------------------------------------- #
# Download                                                                    #
# --------------------------------------------------------------------------- #
def download_databases(
    outdir: str,
    *,
    pgap: bool,
    pfam: bool,
    unpack: bool,
    force: bool,
    logfile: Optional[str],
) -> dict[str, Any]:
    """Download PGAP and/or PFAM profile-HMM databases from NCBI / InterPro.

    The :class:`pynteny.api.Download` wrapper hard-wires PGAP only, so we drive
    the underlying ``download_hmms`` subcommand directly to expose the PFAM
    option too.

    Args:
        outdir: directory to download into.
        pgap: download the PGAP database.
        pfam: download the PFAM database.
        unpack: forwarded to ``download_hmms`` unchanged.
        force: forwarded to ``download_hmms`` unchanged.
        logfile: optional log file path.

    Returns:
        ``outdir``, ``requested`` and the ``hmm_databases`` config state after
        the call (``None`` for unreadable fields), or an ``error`` when neither
        database was selected.
    """
    if not (pgap or pfam):
        return {"error": "Select at least one of pgap / pfam to download."}

    from pynteny.subcommands import download_hmms
    from pynteny.utils import CommandArgs, ConfigParser

    args = CommandArgs(
        outdir=Path(outdir),
        logfile=Path(logfile) if logfile else None,
        force=force,
        unpack=unpack,
        pgap=pgap,
        pfam=pfam,
    )
    try:
        with _pynteny_run():
            download_hmms(args)
    except SystemExit as exc:
        # download_hmms exits(1) when the requested databases are already present.
        # Still a best-effort path, but leave a trace on stderr; the config
        # state returned below shows what is actually available.
        _log(f"download_hmms exited early (code {exc.code})")

    config = ConfigParser.get_default_config()
    return {
        "outdir": str(Path(outdir).absolute()),
        "requested": {"pgap": pgap, "pfam": pfam},
        "hmm_databases": _read_config_fields(config, _DOWNLOAD_STATE_FIELDS),
    }
+ + python tests/smoke_test.py +""" + +from __future__ import annotations + +import asyncio +import json +import os +import sys +import tempfile +from datetime import timedelta +from pathlib import Path + +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client + +HERE = Path(__file__).resolve().parent +MCP_DIR = HERE.parent +SRC_DIR = MCP_DIR / "src" +REPO_ROOT = MCP_DIR.parent +TEST_DATA = REPO_ROOT / "tests" / "test_data" + +# The leu-operon synteny structure used by Pynteny's own integration test. +SYNTENY_STRUC = ( + "<(TIGR00171.1|TIGR02084.1) 0 " + "<(TIGR00170.1|TIGR02083.1) 1 " + "<(TIGR00973.1|NF002084.0|TIGR00970.1)" +) +EXPECTED_LABELS = { + "b0071__U00096_71_78847_79453_neg", + "b0072__U00096_72_79463_80864_neg", + "b0074__U00096_74_81957_83529_neg", +} + +TIMEOUT = timedelta(seconds=300) + + +async def call(session: ClientSession, name: str, **arguments): + result = await session.call_tool(name, arguments, read_timeout_seconds=TIMEOUT) + assert not getattr(result, "isError", False), f"{name} returned an error: {result.content}" + text = "\n".join(getattr(b, "text", "") for b in result.content if getattr(b, "text", "")) + return json.loads(text) + + +def check(label: str, condition: bool, detail: str = "") -> None: + status = "PASS" if condition else "FAIL" + print(f" [{status}] {label}" + (f" — {detail}" if detail else "")) + if not condition: + raise AssertionError(f"{label}: {detail}") + + +async def main() -> None: + data = TEST_DATA / "MG1655.fasta" + hmm_dir = TEST_DATA / "hmms" + hmm_meta = TEST_DATA / "hmm_meta.tsv" + for p in (data, hmm_dir, hmm_meta): + if not p.exists(): + sys.exit(f"Test data not found: {p}") + + server_params = StdioServerParameters( + command=sys.executable, + args=["-m", "pynteny_mcp.server"], + env={ + "PYTHONPATH": str(SRC_DIR), + "PATH": os.environ.get("PATH", ""), + }, + ) + + with tempfile.TemporaryDirectory() as outdir: + async with stdio_client(server_params) as (read, 
write): + async with ClientSession(read, write) as session: + await session.initialize() + tools = (await session.list_tools()).tools + print(f"Server exposes {len(tools)} tools: {', '.join(t.name for t in tools)}\n") + check("expected tool count", len(tools) == 6, f"got {len(tools)}") + + info = await call(session, "get_pynteny_info") + check("pynteny version reported", bool(info.get("version")), str(info.get("version"))) + check("citation present", "Pynteny" in info.get("citation", ""), "") + + good = await call(session, "validate_synteny_structure", + synteny_structure=SYNTENY_STRUC) + check("valid structure recognised", good["valid"] is True, str(good.get("valid"))) + check("structure has 3 genes", good["n_genes"] == 3, str(good.get("n_genes"))) + check("max distances parsed", good["max_distances"] == [0, 1], + str(good.get("max_distances"))) + check("strands parsed", good["strands"] == ["neg", "neg", "neg"], + str(good.get("strands"))) + + # Two adjacent genes with no distance token between them: the + # gene/distance counts don't line up, so this is malformed. 
+ bad = await call(session, "validate_synteny_structure", + synteny_structure=">TIGR00171.1 >TIGR00170.1") + check("invalid structure rejected", bad["valid"] is False, str(bad.get("valid"))) + + hits = await call( + session, "run_synteny_search", + data=str(data), + synteny_structure=SYNTENY_STRUC, + hmm_dir=str(hmm_dir), + hmm_meta=str(hmm_meta), + reuse=True, + outdir=outdir, + ) + check("search returned 3 hits", hits["n_hits"] == 3, str(hits.get("n_hits"))) + labels = {h.get("full_label") for h in hits["hits"]} + check("hit labels match expected leu operon", labels == EXPECTED_LABELS, + str(labels)) + check("hits annotated with gene_symbol", "gene_symbol" in hits["columns"], + str(hits["columns"])) + check("wrote synteny_matched.tsv", bool(hits.get("synteny_table")), + str(hits.get("synteny_table"))) + check("wrote per-gene FASTA files", len(hits.get("fasta_files", [])) >= 1, + str(hits.get("fasta_files"))) + + # best_hmm_wins: must still return the leu hits. On a Pynteny old + # enough to lack the option the service reports a `warning` and + # carries on; on a new-enough Pynteny it is honored (no warning). 
+ bhw = await call( + session, "run_synteny_search", + data=str(data), + synteny_structure=SYNTENY_STRUC, + hmm_dir=str(hmm_dir), + hmm_meta=str(hmm_meta), + reuse=True, + best_hmm_wins=True, + outdir=outdir, + ) + check("best_hmm_wins search still returns 3 hits", bhw["n_hits"] == 3, + str(bhw.get("n_hits"))) + if bhw.get("warning"): + print(f" [NOTE] best_hmm_wins not honored by installed Pynteny: " + f"{bhw['warning']}") + else: + print(" [NOTE] best_hmm_wins honored by installed Pynteny.") + + print("\nAll smoke-test checks passed.") + + +if __name__ == "__main__": + asyncio.run(main()) From 17c1a98c83296b01c6de74841131b61a2e9c06d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Semid=C3=A1n=20Robaina=20Est=C3=A9vez?= Date: Fri, 19 Jun 2026 14:02:30 +0100 Subject: [PATCH 2/2] Format mcp/ with black to satisfy CI quality check Co-Authored-By: Claude Opus 4.8 --- mcp/examples/synteny_search_agent.py | 30 +++++-- mcp/src/pynteny_mcp/service.py | 12 ++- mcp/tests/smoke_test.py | 117 ++++++++++++++++++++------- 3 files changed, 119 insertions(+), 40 deletions(-) diff --git a/mcp/examples/synteny_search_agent.py b/mcp/examples/synteny_search_agent.py index 831e308..c3dfb9d 100644 --- a/mcp/examples/synteny_search_agent.py +++ b/mcp/examples/synteny_search_agent.py @@ -98,7 +98,9 @@ async def call(self, name: str, arguments: dict) -> str: result = await self.session.call_tool( name, arguments, read_timeout_seconds=TOOL_TIMEOUT ) - parts = [getattr(b, "text", "") for b in result.content if getattr(b, "text", "")] + parts = [ + getattr(b, "text", "") for b in result.content if getattr(b, "text", "") + ] text = "\n".join(parts).strip() if not text and getattr(result, "structuredContent", None): text = json.dumps(result.structuredContent) @@ -108,7 +110,11 @@ async def call(self, name: str, arguments: dict) -> str: def anthropic_tools(self) -> list[dict]: return [ - {"name": t.name, "description": t.description or "", "input_schema": t.inputSchema} + { + "name": t.name, + 
"description": t.description or "", + "input_schema": t.inputSchema, + } for t in self.tools ] @@ -210,7 +216,10 @@ async def run_deepseek(toolbox: MCPToolbox, question: str) -> str: { "id": tc.id, "type": "function", - "function": {"name": tc.function.name, "arguments": tc.function.arguments}, + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments, + }, } for tc in msg.tool_calls ] @@ -249,7 +258,9 @@ async def main_async(provider: str, question: str) -> None: toolbox = MCPToolbox(session) await toolbox.load() - print(f"Connected to Pynteny MCP server: {len(toolbox.tools)} tools available") + print( + f"Connected to Pynteny MCP server: {len(toolbox.tools)} tools available" + ) print(f"Provider: {provider}\n") print(f"Question:\n {question}\n") print("--- agent trace ---") @@ -259,7 +270,9 @@ async def main_async(provider: str, question: str) -> None: answer = await run_claude(toolbox, question) else: answer = await run_deepseek(toolbox, question) - except Exception as exc: # noqa: BLE001 — surface a clean message, not a stack trace + except ( + Exception + ) as exc: # noqa: BLE001 — surface a clean message, not a stack trace print(f"\nLLM call failed ({type(exc).__name__}): {exc}") return @@ -270,11 +283,14 @@ async def main_async(provider: str, question: str) -> None: def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( - "--provider", choices=["claude", "deepseek"], default="claude", + "--provider", + choices=["claude", "deepseek"], + default="claude", help="Which LLM backend to use (default: claude).", ) parser.add_argument( - "--question", default=DEFAULT_QUESTION, + "--question", + default=DEFAULT_QUESTION, help="Question to ask the agent (defaults to the leu-operon example).", ) args = parser.parse_args() diff --git a/mcp/src/pynteny_mcp/service.py b/mcp/src/pynteny_mcp/service.py index 07cbdb4..3d2d37b 100644 --- a/mcp/src/pynteny_mcp/service.py +++ b/mcp/src/pynteny_mcp/service.py @@ -72,7 
+72,9 @@ def pynteny_info() -> dict[str, Any]: from pynteny.utils import CommandArgs, ConfigParser from pynteny.subcommands import get_citation - citation = get_citation(CommandArgs(version=__version__, author=__author__), silent=True) + citation = get_citation( + CommandArgs(version=__version__, author=__author__), silent=True + ) config = ConfigParser.get_default_config() db: dict[str, Any] = {} @@ -183,7 +185,9 @@ def validate_structure(synteny_structure: str) -> dict[str, Any]: return result -def parse_gene_symbols(synteny_structure: str, hmm_meta: Optional[str]) -> dict[str, Any]: +def parse_gene_symbols( + synteny_structure: str, hmm_meta: Optional[str] +) -> dict[str, Any]: """Translate a *gene-symbol* synteny structure into one based on HMM names, using a PGAP/PFAM metadata table.""" from pynteny.api import Search @@ -336,7 +340,9 @@ def run_search( # when the installed Pynteny supports it so this works on both. import inspect - best_hmm_wins_supported = "best_hmm_wins" in inspect.signature(Search.__init__).parameters + best_hmm_wins_supported = ( + "best_hmm_wins" in inspect.signature(Search.__init__).parameters + ) if best_hmm_wins_supported: kwargs["best_hmm_wins"] = best_hmm_wins diff --git a/mcp/tests/smoke_test.py b/mcp/tests/smoke_test.py index 8810aab..6ef9341 100644 --- a/mcp/tests/smoke_test.py +++ b/mcp/tests/smoke_test.py @@ -48,8 +48,12 @@ async def call(session: ClientSession, name: str, **arguments): result = await session.call_tool(name, arguments, read_timeout_seconds=TIMEOUT) - assert not getattr(result, "isError", False), f"{name} returned an error: {result.content}" - text = "\n".join(getattr(b, "text", "") for b in result.content if getattr(b, "text", "")) + assert not getattr( + result, "isError", False + ), f"{name} returned an error: {result.content}" + text = "\n".join( + getattr(b, "text", "") for b in result.content if getattr(b, "text", "") + ) return json.loads(text) @@ -82,30 +86,61 @@ async def main() -> None: async with 
ClientSession(read, write) as session: await session.initialize() tools = (await session.list_tools()).tools - print(f"Server exposes {len(tools)} tools: {', '.join(t.name for t in tools)}\n") + print( + f"Server exposes {len(tools)} tools: {', '.join(t.name for t in tools)}\n" + ) check("expected tool count", len(tools) == 6, f"got {len(tools)}") info = await call(session, "get_pynteny_info") - check("pynteny version reported", bool(info.get("version")), str(info.get("version"))) + check( + "pynteny version reported", + bool(info.get("version")), + str(info.get("version")), + ) check("citation present", "Pynteny" in info.get("citation", ""), "") - good = await call(session, "validate_synteny_structure", - synteny_structure=SYNTENY_STRUC) - check("valid structure recognised", good["valid"] is True, str(good.get("valid"))) - check("structure has 3 genes", good["n_genes"] == 3, str(good.get("n_genes"))) - check("max distances parsed", good["max_distances"] == [0, 1], - str(good.get("max_distances"))) - check("strands parsed", good["strands"] == ["neg", "neg", "neg"], - str(good.get("strands"))) + good = await call( + session, + "validate_synteny_structure", + synteny_structure=SYNTENY_STRUC, + ) + check( + "valid structure recognised", + good["valid"] is True, + str(good.get("valid")), + ) + check( + "structure has 3 genes", + good["n_genes"] == 3, + str(good.get("n_genes")), + ) + check( + "max distances parsed", + good["max_distances"] == [0, 1], + str(good.get("max_distances")), + ) + check( + "strands parsed", + good["strands"] == ["neg", "neg", "neg"], + str(good.get("strands")), + ) # Two adjacent genes with no distance token between them: the # gene/distance counts don't line up, so this is malformed. 
- bad = await call(session, "validate_synteny_structure", - synteny_structure=">TIGR00171.1 >TIGR00170.1") - check("invalid structure rejected", bad["valid"] is False, str(bad.get("valid"))) + bad = await call( + session, + "validate_synteny_structure", + synteny_structure=">TIGR00171.1 >TIGR00170.1", + ) + check( + "invalid structure rejected", + bad["valid"] is False, + str(bad.get("valid")), + ) hits = await call( - session, "run_synteny_search", + session, + "run_synteny_search", data=str(data), synteny_structure=SYNTENY_STRUC, hmm_dir=str(hmm_dir), @@ -113,22 +148,39 @@ async def main() -> None: reuse=True, outdir=outdir, ) - check("search returned 3 hits", hits["n_hits"] == 3, str(hits.get("n_hits"))) + check( + "search returned 3 hits", + hits["n_hits"] == 3, + str(hits.get("n_hits")), + ) labels = {h.get("full_label") for h in hits["hits"]} - check("hit labels match expected leu operon", labels == EXPECTED_LABELS, - str(labels)) - check("hits annotated with gene_symbol", "gene_symbol" in hits["columns"], - str(hits["columns"])) - check("wrote synteny_matched.tsv", bool(hits.get("synteny_table")), - str(hits.get("synteny_table"))) - check("wrote per-gene FASTA files", len(hits.get("fasta_files", [])) >= 1, - str(hits.get("fasta_files"))) + check( + "hit labels match expected leu operon", + labels == EXPECTED_LABELS, + str(labels), + ) + check( + "hits annotated with gene_symbol", + "gene_symbol" in hits["columns"], + str(hits["columns"]), + ) + check( + "wrote synteny_matched.tsv", + bool(hits.get("synteny_table")), + str(hits.get("synteny_table")), + ) + check( + "wrote per-gene FASTA files", + len(hits.get("fasta_files", [])) >= 1, + str(hits.get("fasta_files")), + ) # best_hmm_wins: must still return the leu hits. On a Pynteny old # enough to lack the option the service reports a `warning` and # carries on; on a new-enough Pynteny it is honored (no warning). 
bhw = await call( - session, "run_synteny_search", + session, + "run_synteny_search", data=str(data), synteny_structure=SYNTENY_STRUC, hmm_dir=str(hmm_dir), @@ -137,11 +189,16 @@ async def main() -> None: best_hmm_wins=True, outdir=outdir, ) - check("best_hmm_wins search still returns 3 hits", bhw["n_hits"] == 3, - str(bhw.get("n_hits"))) + check( + "best_hmm_wins search still returns 3 hits", + bhw["n_hits"] == 3, + str(bhw.get("n_hits")), + ) if bhw.get("warning"): - print(f" [NOTE] best_hmm_wins not honored by installed Pynteny: " - f"{bhw['warning']}") + print( + f" [NOTE] best_hmm_wins not honored by installed Pynteny: " + f"{bhw['warning']}" + ) else: print(" [NOTE] best_hmm_wins honored by installed Pynteny.")