Two issues mask a model-name mismatch as an opaque indexing bug:
-
create_remote_inference_engines_from_config hardcodes model_name=cfg.trainer.policy.model.path (e.g. Qwen/Qwen3.6-35B-A3B), ignoring generator.inference_engine.served_model_name. If vLLM was started with --served-model-name <basename> (e.g. Qwen3.6-35B-A3B), the request is rejected with HTTP 404 ({"error":{"type":"NotFoundError","message":"The model ... does not exist.","code":404}}). Note that there's a # TODO for this hiccup already:
# TODO(tgriggs): We may want a separate config for the model name in case
# it's different from the name used in the OpenAI API
-
RemoteInferenceEngine.generate never checks the HTTP status code: it parses the 404 body via response.get("choices", []), returns responses=[], and InferenceEngineClient.generate then crashes with an IndexError at line 147 of inference_engine_client.py: responses[original_idx] = result["responses"][local_idx].
Repro
import argparse
import asyncio
import contextlib
import json
import os
import signal
import subprocess
import sys
import time
import urllib.error
import urllib.request
from urllib.parse import urlparse
import aiohttp
from skyrl.backends.skyrl_train.inference_engines.base import InferenceEngineInput
from skyrl.backends.skyrl_train.inference_engines.inference_engine_client import (
InferenceEngineClient,
)
from skyrl.backends.skyrl_train.inference_engines.remote_inference_engine import (
RemoteInferenceEngine,
)
from skyrl.train.config import InferenceEngineConfig, SkyRLLoraConfig
from transformers import AutoTokenizer
def start_vllm_server(
    model: str, served_name: str, host: str, port: int
) -> subprocess.Popen:
    """Launch ``vllm serve`` for *model* and return the process handle.

    The server is advertised under *served_name* and bound to *host*:*port*.
    It is started in a new session so that it leads its own process group,
    which lets the caller signal the whole process tree at once.
    """
    argv = ["vllm", "serve", model]
    argv += ["--served-model-name", served_name]
    argv += ["--host", host]
    argv += ["--port", str(port)]
    argv += ["--enforce-eager", "--no-enable-flashinfer-autotune"]
    argv += ["--gpu-memory-utilization", "0.5"]
    argv += ["--max-model-len", "4096"]

    print(f"[repro] starting: {' '.join(argv)}")
    return subprocess.Popen(argv, start_new_session=True)
def wait_for_health(url: str, proc: subprocess.Popen, timeout_s: float = 300.0) -> None:
    """Poll ``{url}/health`` until it answers HTTP 200.

    Args:
        url: Base URL of the server; ``/health`` is appended to it.
        proc: Handle of the server process, checked on every iteration.
        timeout_s: Overall time budget in seconds.

    Raises:
        RuntimeError: If *proc* has already exited.
        TimeoutError: If no HTTP 200 was seen before the budget ran out.
    """
    give_up_at = time.monotonic() + timeout_s
    health_url = f"{url}/health"

    while True:
        if time.monotonic() >= give_up_at:
            raise TimeoutError(
                f"vLLM did not become ready within {timeout_s:.0f}s at {url}"
            )

        # A dead server will never become healthy; fail fast instead of
        # burning the rest of the budget.
        if proc.poll() is not None:
            raise RuntimeError(f"vLLM exited early with rc={proc.returncode}")

        try:
            with urllib.request.urlopen(health_url, timeout=2) as resp:
                if resp.status == 200:
                    return
        except (urllib.error.URLError, ConnectionError, TimeoutError):
            # Server not accepting connections yet; retry after a pause.
            pass

        time.sleep(2)
def shutdown_vllm(proc: subprocess.Popen) -> None:
    """Stop the vLLM server and every process in its process group.

    Sends SIGTERM to the group, waits up to 30 seconds for the server to
    exit, and escalates to SIGKILL if it does not. Does nothing when the
    process has already exited.

    Args:
        proc: Handle of the server process to stop.
    """
    if proc.poll() is not None:
        return

    print(f"[repro] terminating vLLM (pid={proc.pid})")
    # The group may vanish between poll() and killpg(); that is fine.
    with contextlib.suppress(ProcessLookupError):
        os.killpg(os.getpgid(proc.pid), signal.SIGTERM)

    try:
        proc.wait(timeout=30)
    except subprocess.TimeoutExpired:
        with contextlib.suppress(ProcessLookupError):
            os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
        # Reap the child after SIGKILL so it is not left behind as a zombie
        # and proc.returncode gets populated. Bounded and best-effort: a
        # process that still will not die must not hang the cleanup path.
        with contextlib.suppress(subprocess.TimeoutExpired):
            proc.wait(timeout=10)
async def fetch_raw_404(url: str, model_name: str) -> tuple[int, str]:
    """POST a tiny completion request and return ``(status, body_text)``.

    This is the curl-equivalent of the request, issued directly so the
    operator can see the response body that SkyRL's generate() swallows.
    """
    endpoint = f"{url}/v1/completions"
    payload = {"model": model_name, "prompt": [[1, 2, 3]], "max_tokens": 4}

    async with aiohttp.ClientSession() as session:
        async with session.post(
            endpoint,
            json=payload,
            headers={"Content-Type": "application/json"},
        ) as resp:
            status = resp.status
            body = await resp.text()
            return status, body
async def run_repro(url: str, model_name: str) -> int:
    """Show the raw server response, then drive the same request via SkyRL.

    First POSTs directly to ``{url}/v1/completions`` and prints the status
    and the first 300 characters of the body. Then builds a
    RemoteInferenceEngine / InferenceEngineClient pair for *model_name* and
    calls ``client.generate()`` with a single prompt.

    Returns:
        1 if ``generate()`` unexpectedly succeeds. On the mismatch path the
        call is expected to raise IndexError, so nothing is returned.
    """
    print(f"[repro] POSTing model={model_name!r} to {url}/v1/completions")
    status, body = await fetch_raw_404(url, model_name)
    print(f"[repro] direct POST status={status}, body (truncated 300 chars):")
    print(f" {body[:300]}")
    print(
        "[repro] ^ this is the body SkyRL's RemoteInferenceEngine.generate() swallows "
        "via response.get('choices', []).\n"
    )

    # RemoteInferenceEngine prepends http:// itself, so hand it host:port only.
    target = urlparse(url)
    host_port = target.netloc if target.netloc else target.path

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    remote_engine = RemoteInferenceEngine(
        url=host_port,
        model_name=model_name,
        engine_backend="vllm",
        tokenizer=tokenizer,
    )
    client = InferenceEngineClient(
        engines=[remote_engine],
        tokenizer=tokenizer,
        model_path=model_name,
        lora_cfg=SkyRLLoraConfig(),
        inference_engine_cfg=InferenceEngineConfig(),
    )

    prompt_ids = tokenizer.encode("Hello, world.")
    print(
        f"[repro] calling client.generate() with one prompt of {len(prompt_ids)} tokens"
    )
    request = InferenceEngineInput(
        prompt_token_ids=[prompt_ids],
        sampling_params={"max_tokens": 4},
    )
    output = await client.generate(request)

    # Only reached if the bug did NOT reproduce: on the mismatch path the
    # generate() call above raises IndexError.
    print(f"[repro] UNEXPECTED success: {json.dumps(output, default=str)[:200]}")
    return 1
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the repro script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--url",
        default="http://127.0.0.1:8000",
        help="Base URL of the vLLM server (no trailing /v1).",
    )
    parser.add_argument(
        "--model-name",
        default="Qwen/Qwen2.5-0.5B",
        help=(
            "Model name SkyRL will post in the `model` field. Pass the HF path "
            "(with org prefix) to trigger the mismatch against a vLLM started "
            "with `--served-model-name <basename>`."
        ),
    )
    parser.add_argument(
        "--no-serve",
        action="store_true",
        help="Skip auto-booting vLLM; assume one is already running at --url.",
    )
    parser.add_argument(
        "--boot-timeout",
        type=float,
        default=300.0,
        help="Seconds to wait for vLLM /health (default 300).",
    )
    return parser


def main() -> int:
    """Parse arguments, optionally boot vLLM, and run the repro.

    Unless ``--no-serve`` is given, a vLLM server is started with the
    basename of ``--model-name`` as its served name (which is what creates
    the mismatch) and is shut down again on the way out, whether or not
    the repro raised.

    Returns:
        The value returned by ``run_repro``.
    """
    args = _build_parser().parse_args()

    if args.no_serve:
        return asyncio.run(run_repro(args.url, args.model_name))

    target = urlparse(args.url)
    # Serve under the name without the org prefix, e.g. "Qwen2.5-0.5B".
    basename = args.model_name.rsplit("/", 1)[-1]
    server = start_vllm_server(
        model=args.model_name,
        served_name=basename,
        host=target.hostname or "127.0.0.1",
        port=target.port or 8000,
    )
    try:
        wait_for_health(args.url, server, timeout_s=args.boot_timeout)
        print(f"[repro] vLLM healthy at {args.url}\n")
        return asyncio.run(run_repro(args.url, args.model_name))
    finally:
        shutdown_vllm(server)
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Run the script above with Python 3.12 on a host that has at least one GPU and has vllm==0.20.2, skyrl==0.2.0, and transformers==5.8.0 installed. It:
- Boots vllm serve Qwen/Qwen2.5-0.5B --served-model-name Qwen2.5-0.5B.
- Constructs a RemoteInferenceEngine with model_name="Qwen/Qwen2.5-0.5B" (HF-path form, mismatching the served name).
- Calls .generate() once.
The script first prints the swallowed vLLM 404 body via a direct POST, then triggers the SkyRL path. Expected tail of output:
[repro] starting: vllm serve Qwen/Qwen2.5-0.5B --served-model-name Qwen2.5-0.5B --host 127.0.0.1 --port 8000 ...
[repro] vLLM healthy at http://127.0.0.1:8000
[repro] POSTing model='Qwen/Qwen2.5-0.5B' to http://127.0.0.1:8000/v1/completions
[repro] direct POST status=404, body (truncated 300 chars):
{"error":{"message":"The model `Qwen/Qwen2.5-0.5B` does not exist.","type":"NotFoundError","param":"model","code":404}}
[repro] ^ this is the body SkyRL's RemoteInferenceEngine.generate() swallows via response.get('choices', []).
[repro] calling client.generate() with one prompt of 4 tokens
[repro] terminating vLLM (pid=...)
Traceback (most recent call last):
...
File ".../skyrl/backends/skyrl_train/inference_engines/inference_engine_client.py", line 147, in generate
responses[original_idx] = result["responses"][local_idx]
IndexError: list index out of range
Suggested fixes
- In create_remote_inference_engines_from_config: honor ie_cfg.served_model_name when set.
- In RemoteInferenceEngine.generate: call resp.raise_for_status() (or branch on resp.status >= 400 and surface the body) before parsing JSON.
Also #784 is relevant when fixing.
Two issues mask a model-name mismatch as an opaque indexing bug:
create_remote_inference_engines_from_config hardcodes model_name=cfg.trainer.policy.model.path (e.g. Qwen/Qwen3.6-35B-A3B), ignoring generator.inference_engine.served_model_name. If vLLM was started with --served-model-name <basename> (e.g. Qwen3.6-35B-A3B), the request is rejected with HTTP 404 ({"error":{"type":"NotFoundError","message":"The model ... does not exist.","code":404}}). Note that there's a # TODO for this hiccup already.
RemoteInferenceEngine.generate never checks the status code: it parses the 404 body via response.get("choices", []), returns responses=[], and InferenceEngineClient.generate then crashes with an IndexError at L147: responses[original_idx] = result["responses"][local_idx].
Repro
Run that Python code with Python 3.12 on a host with at least one GPU with vllm==0.20.2, skyrl==0.2.0, and transformers==5.8.0 installed. It:
- Boots vllm serve Qwen/Qwen2.5-0.5B --served-model-name Qwen2.5-0.5B.
- Constructs a RemoteInferenceEngine with model_name="Qwen/Qwen2.5-0.5B" (HF-path form, mismatching the served name).
- Calls .generate() once.
The script first prints the swallowed vLLM 404 body via a direct POST, then triggers the SkyRL path. Expected tail of output:
Suggested fixes
- create_remote_inference_engines_from_config: honor ie_cfg.served_model_name when set.
- RemoteInferenceEngine.generate: call resp.raise_for_status() (or branch on resp.status >= 400 and surface the body) before parsing JSON.
Also #784 is relevant when fixing.